From eb881ed006ca458b052905e33f0d16dbb428063a Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 27 Dec 2024 11:05:08 +0800
Subject: [PATCH 01/48] [misc] fix typing (#11540)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/compilation/backends.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 826d1744d88a5..4f960b441f21d 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -208,8 +208,8 @@ def wrap_inductor(graph: fx.GraphModule,
         from torch._inductor.compile_fx import graph_returns_tuple
         returns_tuple = graph_returns_tuple(graph)
 
-        # this is the graph we return to Dynamo to run
-        def compiled_graph(*args) -> Optional[fx.CompiledFxGraph]:
+        # this is the callable we return to Dynamo to run
+        def compiled_graph(*args):
             # convert args to list
             list_args = list(args)
             graph_output = inductor_compiled_graph(list_args)
@@ -537,7 +537,8 @@ class VllmBackend:
             example_inputs[x].clone() for x in self.sym_tensor_indices
         ]
 
-        def copy_and_call(*args) -> fx.GraphModule:
+        # this is the callable we return to Dynamo to run
+        def copy_and_call(*args):
             list_args = list(args)
             for i, index in enumerate(self.sym_tensor_indices):
                 runtime_tensor = list_args[index]

From 1b875a0ef3767a7da7507943f46e4a53d3f552c9 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 27 Dec 2024 00:19:21 -0500
Subject: [PATCH 02/48] [V1][3/N] API Server: Reduce Task Switching + Handle
 Abort Properly (#11534)

---
 vllm/v1/engine/async_llm.py    | 151 +++++++++++++--------------------
 vllm/v1/engine/async_stream.py |  55 ------------
 vllm/v1/engine/core.py         |   2 +-
 3 files changed, 59 insertions(+), 149 deletions(-)
 delete mode 100644 vllm/v1/engine/async_stream.py

diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index cfdbea8004c35..ba2b8377759d6 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -9,14 +9,13 @@ from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.outputs import PoolingRequestOutput, RequestOutput
+from vllm.outputs import RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
-from vllm.v1.engine.async_stream import AsyncStream
 from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.detokenizer import Detokenizer
 from vllm.v1.engine.processor import Processor
@@ -54,10 +53,8 @@ class AsyncLLM(EngineClient):
             lora_config=vllm_config.lora_config)
         self.tokenizer.ping()
 
-        # Request streams (map of request_id -> AsyncStream).
-        self.request_streams: Dict[str, AsyncStream] = {}
-        # List of cancelled request ids to be aborted.
-        self.client_aborted_requests: List[str] = []
+        # Request streams (map of request_id -> queue).
+        self.rid_to_queue: Dict[str, asyncio.Queue] = {}
 
         # Processor (converts Inputs --> EngineCoreRequests).
         self.processor = Processor(
@@ -153,14 +150,13 @@ class AsyncLLM(EngineClient):
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
-    ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
+    ) -> asyncio.Queue[RequestOutput]:
         """Add new request to the AsyncLLM."""
 
-        if self.detokenizer.is_request_active(request_id):
-            raise ValueError(f"Request {request_id} already exists.")
-
-        # 1) Create a new AsyncStream for the request.
-        stream = self._add_request_to_streams(request_id)
+        # 1) Create a new output queue for the request.
+        if request_id in self.rid_to_queue:
+            raise ValueError(f"Request id {request_id} already running.")
+        self.rid_to_queue[request_id] = asyncio.Queue()
 
         # 2) Convert input --> DetokenizerRequest / EngineCoreRequest.
         detokenizer_req, engine_core_req = self.processor.process_inputs(
@@ -173,8 +169,10 @@ class AsyncLLM(EngineClient):
         # 4) Add the EngineCoreRequest to EngineCore (separate process).
         await self.engine_core.add_request_async(engine_core_req)
 
-        # 5) Return the generator.
-        return stream.generator()
+        if self.log_requests:
+            logger.info("Added request %s.", request_id)
+
+        return self.rid_to_queue[request_id]
 
     # TODO: we should support multiple prompts in one call, as you
     # can do with LLM.generate. So that for multi-prompt completion
@@ -194,7 +192,7 @@ class AsyncLLM(EngineClient):
         """
         Main function called by the API server to kick off a request
             * 1) Making an AsyncStream corresponding to the Request.
-            # 2) Processing the Input.
+            * 2) Processing the Input.
             * 3) Adding the Request to the Detokenizer.
             * 4) Adding the Request to the EngineCore (separate process).
 
@@ -206,14 +204,15 @@ class AsyncLLM(EngineClient):
         returning the RequestOutput back to the caller.
         """
 
-        # We start the output_handler on the first call to generate() so that
-        # we can call __init__ before the event loop starts, which enables us
-        # to handle startup failure gracefully in the OpenAI server.
-        if self.output_handler is None:
-            self.output_handler = asyncio.create_task(
-                self._run_output_handler())
+        try:
+            # We start the output_handler on the first call to generate() so
+            # we can call __init__ before the event loop, which enables us
+            # to handle startup failure gracefully in the OpenAI server.
+            if self.output_handler is None:
+                self.output_handler = asyncio.create_task(
+                    self._run_output_handler())
 
-        async for output in await self.add_request(
+            q = await self.add_request(
                 request_id,
                 prompt,
                 sampling_params,
@@ -221,79 +220,42 @@ class AsyncLLM(EngineClient):
                 trace_headers=trace_headers,
                 prompt_adapter_request=prompt_adapter_request,
                 priority=priority,
-        ):
-            yield output
+            )
 
-    def _finish_stream(self, request_id: str):
-        stream = self.request_streams.pop(request_id, None)
-        if stream is not None:
-            stream.finish()
+            # The output_handler task pushes items into the queue.
+            # This task pulls from the queue and yields to caller.
+            while True:
+                # Note: drain queue without await if possible (avoids
+                # task switching under load which helps performance).
+                out = q.get_nowait() if q.qsize() > 0 else await q.get()
 
-    def _add_request_to_streams(
-        self,
-        request_id: str,
-    ) -> AsyncStream:
+                # Note: both Detokenizer and EngineCore handle their
+                # own request cleanup based on finished.
+                if out.finished:
+                    del self.rid_to_queue[request_id]
+                    yield out
+                    break
 
-        if request_id in self.request_streams:
-            raise ValueError(f"Request id {request_id} already running.")
+                yield out
 
-        # Avoid streams having circular ref to parent AsyncLLM object.
-        aborted_reqs = self.client_aborted_requests
-        stream = AsyncStream(request_id, aborted_reqs.append)
-        self.request_streams[request_id] = stream
-
-        if self.log_requests:
-            logger.info("Added request %s.", request_id)
-
-        return stream
-
-    async def _process_cancellations(self) -> None:
-        """
-        Process requests cancelled from user disconnecting.
-
-        When a client disconnects, AsyncStream._cancel() is called.
-        We passed a callback to AsyncStream(), which appends to 
-        self.client_aborted_requests.
-
-        As a result, if any requests are canceled from the user side
-        the request_id will show up in self.client_aborted_requests.
-        """
-
-        # Avoid streams having circular ref to parent AsyncLLM object.
-        if not self.client_aborted_requests:
-            return
-        reqs_to_abort = self.client_aborted_requests.copy()
-        self.client_aborted_requests.clear()
-
-        # Remove from Detokenizer.
-        self.detokenizer.abort_requests(reqs_to_abort)
-
-        # Remove from RequestStreams.
-        for request_id in reqs_to_abort:
-            if self.log_requests:
-                logger.info("User-cancelled request %s.", request_id)
-            self._finish_stream(request_id)
-
-        # Remove from EngineCore.
-        await self.engine_core.abort_requests_async(reqs_to_abort)
+        # If the request is disconnected by the client, the
+        # generate() task will be canceled. So, we abort the
+        # request if we end up here.
+        except asyncio.CancelledError:
+            await self.abort(request_id)
+            raise
 
     def _process_request_outputs(self, request_outputs: List[RequestOutput]):
-        """Process outputs by putting them into per-request AsyncStreams."""
+        """Process outputs by putting them into per-request queues."""
 
         for request_output in request_outputs:
             request_id = request_output.request_id
-            assert request_id in self.request_streams
 
-            # Each request in the API server pulls from the per-request stream.
-            stream = self.request_streams.get(request_id)
-            if stream is not None:
-                stream.put(request_output)
-
-                # If finished, remove from the tracker.
-                if request_output.finished:
-                    if self.log_requests:
-                        logger.info("Finished request %s.", request_id)
-                    self._finish_stream(request_id)
+            # Note: it is possible a request was aborted and removed from
+            # the state due to client cancellations, so if we encounter a
+            # request id not in the state, we skip.
+            if request_id in self.rid_to_queue:
+                self.rid_to_queue[request_id].put_nowait(request_output)
 
     async def _run_output_handler(self):
         """Background loop: pulls from EngineCore and pushes to AsyncStreams."""
@@ -306,24 +268,27 @@ class AsyncLLM(EngineClient):
                 # 2) Detokenize based on the output.
                 request_outputs, reqs_to_abort = self.detokenizer.step(outputs)
 
-                # 3) Put the RequestOutputs into the per-request AsyncStreams.
+                # 3) Put the RequestOutputs into the per-request queues.
                 self._process_request_outputs(request_outputs)
 
                 # 4) Abort any requests that finished due to stop strings.
                 await self.engine_core.abort_requests_async(reqs_to_abort)
 
-                # 5) Abort any requests due to client cancellations.
-                await self._process_cancellations()
-
         except BaseException as e:
             logger.error(e)
             raise e
 
-    # TODO: can we eliminate these?
-
     async def abort(self, request_id: str) -> None:
-        # Note: Who Calls this? I dont think this is actually used.
-        raise ValueError("Not Supported on V1 yet.")
+        """Abort RequestId in self, detokenizer, and engine core."""
+
+        request_ids = [request_id]
+        await self.engine_core.abort_requests_async(request_ids)
+        self.detokenizer.abort_requests(request_ids)
+
+        # If a request finishes while we await then the request_id
+        # will be removed from the tracked queues before we get here.
+        if request_id in self.rid_to_queue:
+            del self.rid_to_queue[request_id]
 
     def encode(
         self,
diff --git a/vllm/v1/engine/async_stream.py b/vllm/v1/engine/async_stream.py
deleted file mode 100644
index 35449238c3259..0000000000000
--- a/vllm/v1/engine/async_stream.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import asyncio
-from typing import Any, AsyncGenerator, Callable, Optional, Type, Union
-
-from vllm.outputs import PoolingRequestOutput, RequestOutput
-
-
-class AsyncStream:
-    """A stream of RequestOutputs or PoolingRequestOutputs for a request
-    that can be iterated over asynchronously via an async generator."""
-
-    STOP_ITERATION = Exception()  # Sentinel
-
-    def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None:
-        self.request_id = request_id
-        self._cancel = cancel
-        self._queue: asyncio.Queue = asyncio.Queue()
-        self._finished = False
-
-    def put(self, item: Union[RequestOutput, PoolingRequestOutput,
-                              Exception]) -> None:
-        if not self._finished:
-            self._queue.put_nowait(item)
-
-    def finish(
-        self,
-        exception: Optional[Union[BaseException, Type[BaseException]]] = None,
-    ) -> None:
-        if not self._finished:
-            self._finished = True
-            self._queue.put_nowait(exception if self._is_raisable(exception)
-                                   else AsyncStream.STOP_ITERATION)
-
-    async def generator(
-        self
-    ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
-        finished = False
-        try:
-            while True:
-                result = await self._queue.get()
-                if self._is_raisable(result):
-                    finished = True
-                    if result == AsyncStream.STOP_ITERATION:
-                        return
-                    raise result
-                yield result
-        finally:
-            self._finished = True
-            if not finished:
-                self._cancel(self.request_id)
-
-    @staticmethod
-    def _is_raisable(value: Any):
-        return isinstance(value, BaseException) or \
-                (isinstance(value, type) and \
-                 issubclass(value, BaseException))
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 497d5db5b4c99..0aef61fc7f680 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -32,7 +32,7 @@ logger = init_logger(__name__)
 
 POLLING_TIMEOUT_MS = 5000
 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000
-LOGGING_TIME_S = 5000
+LOGGING_TIME_S = POLLING_TIMEOUT_S
 
 
 class EngineCore:

From 2339d59f9260499599d60599f83978fad1827999 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 27 Dec 2024 01:23:29 -0500
Subject: [PATCH 03/48] [BugFix] Fix quantization for all other methods
 (#11547)

---
 vllm/model_executor/layers/fused_moe/layer.py | 19 ++++++++++++----
 .../layers/quantization/awq_marlin.py         | 10 ++++++---
 .../compressed_tensors_moe.py                 | 22 +++++++++++++------
 .../layers/quantization/experts_int8.py       | 10 ++++++---
 .../model_executor/layers/quantization/fp8.py |  3 +--
 .../layers/quantization/gptq_marlin.py        | 10 ++++++---
 6 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 01ffac4550f28..b108cbd52c218 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -41,9 +41,20 @@ class FusedMoEMethodBase(QuantizeMethodBase):
         raise NotImplementedError
 
     @abstractmethod
-    def apply(self, layer: torch.nn.Module, x: torch.Tensor,
-              router_logits: torch.Tensor, top_k: int, renormalize: bool,
-              use_grouped_topk: bool) -> torch.Tensor:
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
         raise NotImplementedError
 
 
@@ -79,7 +90,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         router_logits: torch.Tensor,
         top_k: int,
         renormalize: bool,
-        use_grouped_topk: bool,
+        use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 4d1a837d11585..c28fd0c6737e0 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -440,11 +440,13 @@ class AWQMoEMethod(FusedMoEMethodBase):
         x: torch.Tensor,
         router_logits: torch.Tensor,
         top_k: int,
-        renormalize: bool = True,
+        renormalize: bool,
         use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
         topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
@@ -454,7 +456,9 @@ class AWQMoEMethod(FusedMoEMethodBase):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)
 
         return torch.ops.vllm.fused_marlin_moe(
             x,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index dad04017d3212..5fd6b017f444b 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -203,13 +203,14 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         x: torch.Tensor,
         router_logits: torch.Tensor,
         top_k: int,
-        renormalize: bool = True,
+        renormalize: bool,
         use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
         topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-
         from vllm.model_executor.layers.fused_moe import fused_experts
 
         topk_weights, topk_ids = FusedMoE.select_experts(
@@ -220,7 +221,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)
 
         return fused_experts(x,
                              layer.w13_weight,
@@ -476,12 +479,15 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
         x: torch.Tensor,
         router_logits: torch.Tensor,
         top_k: int,
-        renormalize: bool = True,
+        renormalize: bool,
         use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
         topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -490,7 +496,9 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)
 
         return torch.ops.vllm.fused_marlin_moe(
             x,
diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py
index 97297970d9317..209f12c6dfec9 100644
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -99,11 +99,13 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
         x: torch.Tensor,
         router_logits: torch.Tensor,
         top_k: int,
-        renormalize: bool = True,
+        renormalize: bool,
         use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
         topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
@@ -115,7 +117,9 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)
 
         return fused_experts(x,
                              layer.w13_weight,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 4362468c1db69..7f779ac8d3b3e 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -601,14 +601,13 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         router_logits: torch.Tensor,
         top_k: int,
         renormalize: bool,
-        use_grouped_topk: bool,
+        use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
         scoring_func: str = "softmax",
         e_score_correction_bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-
         from vllm.model_executor.layers.fused_moe import fused_experts
 
         topk_weights, topk_ids = FusedMoE.select_experts(
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index a3e58bf1b2a4c..a006d729cc627 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -532,11 +532,13 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         x: torch.Tensor,
         router_logits: torch.Tensor,
         top_k: int,
-        renormalize: bool = True,
+        renormalize: bool,
         use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
         topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         # The input must currently be float16
         orig_dtype = x.dtype
@@ -550,7 +552,9 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
-            custom_routing_function=None)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)
 
         return torch.ops.vllm.fused_marlin_moe(
             x,

From 6c6f7fe8a850ca08f9a8774de020163a2a7c2164 Mon Sep 17 00:00:00 2001
From: Mengqing Cao <cmq0113@163.com>
Date: Fri, 27 Dec 2024 16:45:25 +0800
Subject: [PATCH 04/48] [Platform] Move model arch check to platform (#11503)

Signed-off-by: Mengqing Cao <cmq0113@163.com>
---
 vllm/model_executor/models/registry.py | 37 +-----------------------
 vllm/platforms/interface.py            | 12 ++++++++
 vllm/platforms/rocm.py                 | 39 +++++++++++++++++++++++++-
 3 files changed, 51 insertions(+), 37 deletions(-)

diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index feb33bb373c3e..89992de7e238d 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -187,31 +187,6 @@ _VLLM_MODELS = {
     **_SPECULATIVE_DECODING_MODELS,
 }
 
-# Models not supported by ROCm.
-_ROCM_UNSUPPORTED_MODELS: List[str] = []
-
-# Models partially supported by ROCm.
-# Architecture -> Reason.
-_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in "
-                    "Triton flash attention. For half-precision SWA support, "
-                    "please use CK flash attention by setting "
-                    "`VLLM_USE_TRITON_FLASH_ATTN=0`")
-_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
-    "Qwen2ForCausalLM":
-    _ROCM_SWA_REASON,
-    "MistralForCausalLM":
-    _ROCM_SWA_REASON,
-    "MixtralForCausalLM":
-    _ROCM_SWA_REASON,
-    "PaliGemmaForConditionalGeneration":
-    ("ROCm flash attention does not yet "
-     "fully support 32-bit precision on PaliGemma"),
-    "Phi3VForCausalLM":
-    ("ROCm Triton flash attention may run into compilation errors due to "
-     "excessive use of shared memory. If this happens, disable Triton FA "
-     "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
-}
-
 
 @dataclass(frozen=True)
 class _ModelInfo:
@@ -297,17 +272,7 @@ def _try_load_model_cls(
     model_arch: str,
     model: _BaseRegisteredModel,
 ) -> Optional[Type[nn.Module]]:
-    if current_platform.is_rocm():
-        if model_arch in _ROCM_UNSUPPORTED_MODELS:
-            raise ValueError(f"Model architecture '{model_arch}' is not "
-                             "supported by ROCm for now.")
-
-        if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
-            msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]
-            logger.warning(
-                "Model architecture '%s' is partially "
-                "supported by ROCm: %s", model_arch, msg)
-
+    current_platform.verify_model_arch(model_arch)
     try:
         return model.load_model_cls()
     except Exception:
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 4150b0cdf836a..ddccaa2ce0148 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -199,6 +199,18 @@ class Platform:
         """
         pass
 
+    @classmethod
+    def verify_model_arch(cls, model_arch: str) -> None:
+        """
+        Verify whether the current platform supports the specified model
+        architecture.
+
+        - This will raise an Error or Warning based on the model support on
+        the current platform.
+        - By default all models are considered supported.
+        """
+        pass
+
     @classmethod
     def verify_quantization(cls, quant: str) -> None:
         """
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 7778b565372cb..aa779f265135f 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -1,6 +1,6 @@
 import os
 from functools import lru_cache
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Dict, List, Optional
 
 import torch
 
@@ -33,6 +33,31 @@ if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]:
                    " `spawn` instead.")
     os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
+# Models not supported by ROCm.
+_ROCM_UNSUPPORTED_MODELS: List[str] = []
+
+# Models partially supported by ROCm.
+# Architecture -> Reason.
+_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in "
+                    "Triton flash attention. For half-precision SWA support, "
+                    "please use CK flash attention by setting "
+                    "`VLLM_USE_TRITON_FLASH_ATTN=0`")
+_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
+    "Qwen2ForCausalLM":
+    _ROCM_SWA_REASON,
+    "MistralForCausalLM":
+    _ROCM_SWA_REASON,
+    "MixtralForCausalLM":
+    _ROCM_SWA_REASON,
+    "PaliGemmaForConditionalGeneration":
+    ("ROCm flash attention does not yet "
+     "fully support 32-bit precision on PaliGemma"),
+    "Phi3VForCausalLM":
+    ("ROCm Triton flash attention may run into compilation errors due to "
+     "excessive use of shared memory. If this happens, disable Triton FA "
+     "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
+}
+
 
 class RocmPlatform(Platform):
     _enum = PlatformEnum.ROCM
@@ -102,6 +127,18 @@ class RocmPlatform(Platform):
             else:
                 parallel_config.worker_cls = "vllm.worker.worker.Worker"
 
+    @classmethod
+    def verify_model_arch(cls, model_arch: str) -> None:
+        if model_arch in _ROCM_UNSUPPORTED_MODELS:
+            raise ValueError(f"Model architecture '{model_arch}' is not "
+                             "supported by ROCm for now.")
+
+        if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
+            msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]
+            logger.warning(
+                "Model architecture '%s' is partially "
+                "supported by ROCm: %s", model_arch, msg)
+
     @classmethod
     def verify_quantization(cls, quant: str) -> None:
         super().verify_quantization(quant)

From d003f3ea391b4c879f6f848dd485dd3c04fa6ca9 Mon Sep 17 00:00:00 2001
From: AlexHe99 <alehe@amd.com>
Date: Fri, 27 Dec 2024 18:00:04 +0800
Subject: [PATCH 05/48] Update deploying_with_k8s.md with AMD ROCm GPU example
 (#11465)

Signed-off-by: Alex He <alehe@amd.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 docs/source/serving/deploying_with_k8s.md | 79 ++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)

diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/serving/deploying_with_k8s.md
index d27db826cd006..77f848088ea43 100644
--- a/docs/source/serving/deploying_with_k8s.md
+++ b/docs/source/serving/deploying_with_k8s.md
@@ -47,7 +47,11 @@ data:
   token: "REPLACE_WITH_TOKEN"
 ```
 
-Create a deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model:
+Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.
+
+Here are two examples for using NVIDIA GPU and AMD GPU. 
+
+- NVIDIA GPU
 
 ```yaml
 apiVersion: apps/v1
@@ -119,6 +123,79 @@ spec:
           periodSeconds: 5
 ```
 
+- AMD GPU
+
+You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X.
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: mistral-7b
+  namespace: default
+  labels:
+    app: mistral-7b
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: mistral-7b
+  template:
+    metadata:
+      labels:
+        app: mistral-7b
+    spec:
+      volumes:
+      # PVC
+      - name: cache-volume
+        persistentVolumeClaim:
+          claimName: mistral-7b
+      # vLLM needs to access the host's shared memory for tensor parallel inference.
+      - name: shm
+        emptyDir:
+          medium: Memory
+          sizeLimit: "8Gi"
+      hostNetwork: true
+      hostIPC: true
+      containers:
+      - name: mistral-7b
+        image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
+        securityContext:
+          seccompProfile:
+            type: Unconfined
+          runAsGroup: 44
+          capabilities:
+            add:
+            - SYS_PTRACE
+        command: ["/bin/sh", "-c"]
+        args: [
+          "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
+        ]
+        env:
+        - name: HUGGING_FACE_HUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token-secret
+              key: token
+        ports:
+        - containerPort: 8000
+        resources:
+          limits:
+            cpu: "10"
+            memory: 20G
+            amd.com/gpu: "1"
+          requests:
+            cpu: "6"
+            memory: 6G
+            amd.com/gpu: "1"
+        volumeMounts:
+        - name: cache-volume
+          mountPath: /root/.cache/huggingface
+        - name: shm
+          mountPath: /dev/shm
+```
+You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.
+
 2. **Create a Kubernetes Service for vLLM**
 
 Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:

From 2c9b8ea2b006e763b8268b8ab02181c9822cfe76 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Fri, 27 Dec 2024 18:39:15 +0800
Subject: [PATCH 06/48] [Bugfix] Fix TeleChat2ForCausalLM weights mapper
 (#11546)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/models/telechat2.py | 26 ++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py
index 28c37bb96612c..02ca7fe08e556 100644
--- a/vllm/model_executor/models/telechat2.py
+++ b/vllm/model_executor/models/telechat2.py
@@ -31,19 +31,6 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
 
 class TeleChat2Model(LlamaModel):
 
-    hf_to_vllm_mapper = WeightsMapper(
-        orig_to_new_prefix={
-            "transformer.": "model.",
-        },
-        orig_to_new_substr={
-            ".h.": ".layers.",
-            ".self_attention.": ".self_attn.",
-            ".word_embeddings.": ".embed_tokens.",
-            ".dense.": ".o_proj.",
-            ".ln_f.": ".norm.",
-        },
-    )
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         # 1. Initialize the LlamaModel with bias
         vllm_config.model_config.hf_config.bias = True
@@ -118,6 +105,19 @@ class TeleChat2Model(LlamaModel):
 
 class TeleChat2ForCausalLM(LlamaForCausalLM):
 
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "transformer.": "model.",
+        },
+        orig_to_new_substr={
+            ".h.": ".layers.",
+            ".self_attention.": ".self_attn.",
+            ".word_embeddings.": ".embed_tokens.",
+            ".dense.": ".o_proj.",
+            ".ln_f.": ".norm.",
+        },
+    )
+
     def _init_model(self, vllm_config: VllmConfig, prefix: str = ""):
         return TeleChat2Model(vllm_config=vllm_config, prefix=prefix)
 

From 7af553ea30031446b4c1c74ad83187f9fd3de4e7 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 27 Dec 2024 19:21:23 +0800
Subject: [PATCH 07/48] [Misc] Abstract the logic for reading and writing media
 content (#11527)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/entrypoints/openai/test_serving_chat.py |   1 +
 tests/entrypoints/test_chat_utils.py          |   6 +-
 tests/multimodal/test_utils.py                |  59 ++-
 vllm/assets/audio.py                          |   6 +-
 vllm/entrypoints/chat_utils.py                | 129 +++--
 vllm/multimodal/audio.py                      |  36 +-
 vllm/multimodal/base.py                       |  38 +-
 vllm/multimodal/image.py                      |  41 +-
 vllm/multimodal/utils.py                      | 481 ++++++++----------
 vllm/multimodal/video.py                      |  87 +++-
 10 files changed, 495 insertions(+), 389 deletions(-)

diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 51b255bb2a6db..61677b65af342 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -33,6 +33,7 @@ class MockModelConfig:
     hf_config = MockHFConfig()
     logits_processor_pattern = None
     diff_sampling_param: Optional[dict] = None
+    allowed_local_media_path: str = ""
 
     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 996e60bfee592..d63b963522e73 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -2,7 +2,6 @@ import warnings
 from typing import Optional
 
 import pytest
-from PIL import Image
 
 from vllm.assets.image import ImageAsset
 from vllm.config import ModelConfig
@@ -91,10 +90,7 @@ def _assert_mm_data_is_image_input(
     image_data = mm_data.get("image")
     assert image_data is not None
 
-    if image_count == 1:
-        assert isinstance(image_data, Image.Image)
-    else:
-        assert isinstance(image_data, list) and len(image_data) == image_count
+    assert isinstance(image_data, list) and len(image_data) == image_count
 
 
 def test_parse_chat_messages_single_image(
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index fd82fb0c55fd7..6029f2e514772 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -9,7 +9,7 @@ import pytest
 from PIL import Image, ImageChops
 from transformers import AutoConfig, AutoTokenizer
 
-from vllm.multimodal.utils import (async_fetch_image, fetch_image,
+from vllm.multimodal.utils import (MediaConnector,
                                    repeat_and_pad_placeholder_tokens)
 
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
@@ -23,7 +23,12 @@ TEST_IMAGE_URLS = [
 
 @pytest.fixture(scope="module")
 def url_images() -> Dict[str, Image.Image]:
-    return {image_url: fetch_image(image_url) for image_url in TEST_IMAGE_URLS}
+    connector = MediaConnector()
+
+    return {
+        image_url: connector.fetch_image(image_url)
+        for image_url in TEST_IMAGE_URLS
+    }
 
 
 def get_supported_suffixes() -> Tuple[str, ...]:
@@ -43,8 +48,10 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool:
 @pytest.mark.asyncio
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
 async def test_fetch_image_http(image_url: str):
-    image_sync = fetch_image(image_url)
-    image_async = await async_fetch_image(image_url)
+    connector = MediaConnector()
+
+    image_sync = connector.fetch_image(image_url)
+    image_async = await connector.fetch_image_async(image_url)
     assert _image_equals(image_sync, image_async)
 
 
@@ -53,6 +60,7 @@ async def test_fetch_image_http(image_url: str):
 @pytest.mark.parametrize("suffix", get_supported_suffixes())
 async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
                                   image_url: str, suffix: str):
+    connector = MediaConnector()
     url_image = url_images[image_url]
 
     try:
@@ -75,48 +83,49 @@ async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
         base64_image = base64.b64encode(f.read()).decode("utf-8")
         data_url = f"data:{mime_type};base64,{base64_image}"
 
-        data_image_sync = fetch_image(data_url)
+        data_image_sync = connector.fetch_image(data_url)
         if _image_equals(url_image, Image.open(f)):
             assert _image_equals(url_image, data_image_sync)
         else:
             pass  # Lossy format; only check that image can be opened
 
-        data_image_async = await async_fetch_image(data_url)
+        data_image_async = await connector.fetch_image_async(data_url)
         assert _image_equals(data_image_sync, data_image_async)
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
 async def test_fetch_image_local_files(image_url: str):
+    connector = MediaConnector()
+
     with TemporaryDirectory() as temp_dir:
-        origin_image = fetch_image(image_url)
+        local_connector = MediaConnector(allowed_local_media_path=temp_dir)
+
+        origin_image = connector.fetch_image(image_url)
         origin_image.save(os.path.join(temp_dir, os.path.basename(image_url)),
                           quality=100,
                           icc_profile=origin_image.info.get('icc_profile'))
 
-        image_async = await async_fetch_image(
-            f"file://{temp_dir}/{os.path.basename(image_url)}",
-            allowed_local_media_path=temp_dir)
-
-        image_sync = fetch_image(
-            f"file://{temp_dir}/{os.path.basename(image_url)}",
-            allowed_local_media_path=temp_dir)
+        image_async = await local_connector.fetch_image_async(
+            f"file://{temp_dir}/{os.path.basename(image_url)}")
+        image_sync = local_connector.fetch_image(
+            f"file://{temp_dir}/{os.path.basename(image_url)}")
         # Check that the images are equal
         assert not ImageChops.difference(image_sync, image_async).getbbox()
 
-        with pytest.raises(ValueError):
-            await async_fetch_image(
-                f"file://{temp_dir}/../{os.path.basename(image_url)}",
-                allowed_local_media_path=temp_dir)
-        with pytest.raises(ValueError):
-            await async_fetch_image(
+        with pytest.raises(ValueError, match="must be a subpath"):
+            await local_connector.fetch_image_async(
+                f"file://{temp_dir}/../{os.path.basename(image_url)}")
+        with pytest.raises(RuntimeError, match="Cannot load local files"):
+            await connector.fetch_image_async(
                 f"file://{temp_dir}/../{os.path.basename(image_url)}")
 
-        with pytest.raises(ValueError):
-            fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}",
-                        allowed_local_media_path=temp_dir)
-        with pytest.raises(ValueError):
-            fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}")
+        with pytest.raises(ValueError, match="must be a subpath"):
+            local_connector.fetch_image(
+                f"file://{temp_dir}/../{os.path.basename(image_url)}")
+        with pytest.raises(RuntimeError, match="Cannot load local files"):
+            connector.fetch_image(
+                f"file://{temp_dir}/../{os.path.basename(image_url)}")
 
 
 @pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-mistral-7b-hf"])
diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py
index 9033644e3264a..a46c67ad7e00e 100644
--- a/vllm/assets/audio.py
+++ b/vllm/assets/audio.py
@@ -21,12 +21,10 @@ class AudioAsset:
     name: Literal["winning_call", "mary_had_lamb"]
 
     @property
-    def audio_and_sample_rate(self) -> tuple[npt.NDArray, int]:
+    def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
         audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg",
                                             s3_prefix=ASSET_DIR)
-        y, sr = librosa.load(audio_path, sr=None)
-        assert isinstance(sr, int)
-        return y, sr
+        return librosa.load(audio_path, sr=None)
 
     @property
     def url(self) -> str:
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 3df08c740d65b..a492d5496e025 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -6,7 +6,7 @@ from collections import defaultdict, deque
 from functools import lru_cache, partial
 from pathlib import Path
 from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List,
-                    Literal, Mapping, Optional, Tuple, TypeVar, Union, cast)
+                    Literal, Optional, Tuple, TypeVar, Union, cast)
 
 import jinja2.nodes
 import transformers.utils.chat_template_utils as hf_chat_utils
@@ -23,6 +23,8 @@ from openai.types.chat import (
     ChatCompletionMessageParam as OpenAIChatCompletionMessageParam)
 from openai.types.chat import (ChatCompletionMessageToolCallParam,
                                ChatCompletionToolMessageParam)
+from openai.types.chat.chat_completion_content_part_input_audio_param import (
+    InputAudio)
 # yapf: enable
 # pydantic needs the TypedDict from typing_extensions
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -31,11 +33,7 @@ from typing_extensions import Required, TypeAlias, TypedDict
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
 from vllm.multimodal import MultiModalDataDict
-from vllm.multimodal.utils import (async_get_and_parse_audio,
-                                   async_get_and_parse_image,
-                                   async_get_and_parse_video,
-                                   get_and_parse_audio, get_and_parse_image,
-                                   get_and_parse_video)
+from vllm.multimodal.utils import MediaConnector
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.utils import print_warning_once
 
@@ -368,14 +366,17 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
         self._tokenizer = tokenizer
         self._allowed_items = (model_config.multimodal_config.limit_per_prompt
                                if model_config.multimodal_config else {})
-        self._consumed_items = {k: 0 for k in self._allowed_items}
 
-        self._items: List[_T] = []
+        self._items_by_modality = defaultdict[str, list[_T]](list)
 
     @property
     def model_config(self) -> ModelConfig:
         return self._model_config
 
+    @property
+    def allowed_local_media_path(self):
+        return self._model_config.allowed_local_media_path
+
     @staticmethod
     @lru_cache(maxsize=None)
     def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
@@ -435,38 +436,19 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
         else:
             raise TypeError(f"Unknown modality: {modality}")
 
-    @staticmethod
-    def _combine(items: List[MultiModalDataDict]) -> MultiModalDataDict:
-        mm_lists: Mapping[str, List[object]] = defaultdict(list)
-
-        # Merge all the multi-modal items
-        for single_mm_data in items:
-            for mm_key, mm_item in single_mm_data.items():
-                if isinstance(mm_item, list):
-                    mm_lists[mm_key].extend(mm_item)
-                else:
-                    mm_lists[mm_key].append(mm_item)
-
-        # Unpack any single item lists for models that don't expect multiple.
-        return {
-            mm_key: mm_list[0] if len(mm_list) == 1 else mm_list
-            for mm_key, mm_list in mm_lists.items()
-        }
-
     def add(self, modality: ModalityStr, item: _T) -> Optional[str]:
         """
         Add a multi-modal item to the current prompt and returns the
         placeholder string to use, if any.
         """
         allowed_count = self._allowed_items.get(modality, 1)
-        current_count = self._consumed_items.get(modality, 0) + 1
+        current_count = len(self._items_by_modality[modality]) + 1
         if current_count > allowed_count:
             raise ValueError(
                 f"At most {allowed_count} {modality}(s) may be provided in "
                 "one request.")
 
-        self._consumed_items[modality] = current_count
-        self._items.append(item)
+        self._items_by_modality[modality].append(item)
 
         return self._placeholder_str(modality, current_count)
 
@@ -475,22 +457,26 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
         raise NotImplementedError
 
 
-class MultiModalItemTracker(BaseMultiModalItemTracker[MultiModalDataDict]):
+class MultiModalItemTracker(BaseMultiModalItemTracker[object]):
 
     def all_mm_data(self) -> Optional[MultiModalDataDict]:
-        return self._combine(self._items) if self._items else None
+        if self._items_by_modality:
+            return dict(self._items_by_modality)
+
+        return None
 
     def create_parser(self) -> "BaseMultiModalContentParser":
         return MultiModalContentParser(self)
 
 
-class AsyncMultiModalItemTracker(
-        BaseMultiModalItemTracker[Awaitable[MultiModalDataDict]]):
+class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]):
 
     async def all_mm_data(self) -> Optional[MultiModalDataDict]:
-        if self._items:
-            items = await asyncio.gather(*self._items)
-            return self._combine(items)
+        if self._items_by_modality:
+            return {
+                modality: await asyncio.gather(*items)
+                for modality, items in self._items_by_modality.items()
+            }
 
         return None
 
@@ -522,7 +508,7 @@ class BaseMultiModalContentParser(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
+    def parse_input_audio(self, input_audio: InputAudio) -> None:
         raise NotImplementedError
 
     @abstractmethod
@@ -537,31 +523,31 @@ class MultiModalContentParser(BaseMultiModalContentParser):
 
         self._tracker = tracker
 
+        self._connector = MediaConnector(
+            allowed_local_media_path=tracker.allowed_local_media_path,
+        )
+
     def parse_image(self, image_url: str) -> None:
-        image = get_and_parse_image(image_url,
-                                    allowed_local_media_path=self._tracker.
-                                    _model_config.allowed_local_media_path)
+        image = self._connector.fetch_image(image_url)
 
         placeholder = self._tracker.add("image", image)
         self._add_placeholder(placeholder)
 
     def parse_audio(self, audio_url: str) -> None:
-        audio = get_and_parse_audio(audio_url)
+        audio = self._connector.fetch_audio(audio_url)
 
         placeholder = self._tracker.add("audio", audio)
         self._add_placeholder(placeholder)
 
-    def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
-        input_audio_data = input_audio.get("data","")
-        input_audio_format = input_audio.get("format","")
-        audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}"
-        audio = get_and_parse_audio(audio_url)
+    def parse_input_audio(self, input_audio: InputAudio) -> None:
+        audio_data = input_audio.get("data", "")
+        audio_format = input_audio.get("format", "")
+        audio_url = f"data:audio/{audio_format};base64,{audio_data}"
 
-        placeholder = self._tracker.add("audio", audio)
-        self._add_placeholder(placeholder)
+        return self.parse_audio(audio_url)
 
     def parse_video(self, video_url: str) -> None:
-        video = get_and_parse_video(video_url)
+        video = self._connector.fetch_video(video_url)
 
         placeholder = self._tracker.add("video", video)
         self._add_placeholder(placeholder)
@@ -573,33 +559,31 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
         super().__init__()
 
         self._tracker = tracker
+        self._connector = MediaConnector(
+            allowed_local_media_path=tracker.allowed_local_media_path,
+        )
 
     def parse_image(self, image_url: str) -> None:
-        image_coro = async_get_and_parse_image(
-            image_url,
-            allowed_local_media_path=self._tracker._model_config.
-            allowed_local_media_path)
+        image_coro = self._connector.fetch_image_async(image_url)
 
         placeholder = self._tracker.add("image", image_coro)
         self._add_placeholder(placeholder)
 
     def parse_audio(self, audio_url: str) -> None:
-        audio_coro = async_get_and_parse_audio(audio_url)
+        audio_coro = self._connector.fetch_audio_async(audio_url)
 
         placeholder = self._tracker.add("audio", audio_coro)
         self._add_placeholder(placeholder)
 
-    def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
-        input_audio_data = input_audio.get("data","")
-        input_audio_format = input_audio.get("format","")
-        audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}"
-        audio_coro = async_get_and_parse_audio(audio_url)
+    def parse_input_audio(self, input_audio: InputAudio) -> None:
+        audio_data = input_audio.get("data", "")
+        audio_format = input_audio.get("format", "")
+        audio_url = f"data:audio/{audio_format};base64,{audio_data}"
 
-        placeholder = self._tracker.add("audio", audio_coro)
-        self._add_placeholder(placeholder)
+        return self.parse_audio(audio_url)
 
     def parse_video(self, video_url: str) -> None:
-        video = async_get_and_parse_video(video_url)
+        video = self._connector.fetch_video_async(video_url)
 
         placeholder = self._tracker.add("video", video)
         self._add_placeholder(placeholder)
@@ -695,10 +679,13 @@ _InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
 _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
 _VideoParser = partial(cast, ChatCompletionContentPartVideoParam)
 
+_ContentPart: TypeAlias = Union[str, Dict[str, str], InputAudio]
+
 # Define a mapping from part types to their corresponding parsing functions.
-MM_PARSER_MAP: Dict[str,
-                    Callable[[ChatCompletionContentPartParam],
-                             Union[str, Dict[str,str]]]] = {
+MM_PARSER_MAP: Dict[
+    str,
+    Callable[[ChatCompletionContentPartParam], _ContentPart],
+] = {
     "text":
     lambda part: _TextParser(part).get("text", ""),
     "image_url":
@@ -715,8 +702,7 @@ MM_PARSER_MAP: Dict[str,
 
 
 def _parse_chat_message_content_mm_part(
-        part: ChatCompletionContentPartParam) -> Tuple[str,
-                                                Union[str, Dict[str, str]]]:
+        part: ChatCompletionContentPartParam) -> tuple[str, _ContentPart]:
     """
     Parses a given multi-modal content part based on its type.
 
@@ -783,7 +769,7 @@ def _parse_chat_message_content_parts(
     *,
     wrap_dicts: bool,
 ) -> List[ConversationMessage]:
-    content: List[Union[str, Dict[str, str]]] = []
+    content = list[_ContentPart]()
 
     mm_parser = mm_tracker.create_parser()
 
@@ -814,7 +800,7 @@ def _parse_chat_message_content_part(
     mm_parser: BaseMultiModalContentParser,
     *,
     wrap_dicts: bool,
-) -> Optional[Union[str, Dict[str, str]]]:
+) -> Optional[_ContentPart]:
     """Parses a single part of a conversation. If wrap_dicts is True,
     structured dictionary pieces for texts and images will be
     wrapped in dictionaries, i.e., {"type": "text", "text", ...} and
@@ -823,8 +809,7 @@ def _parse_chat_message_content_part(
     with multimodal placeholders.
     """
     if isinstance(part, str):  # Handle plain text parts
-        text = _TextParser(part)
-        return text
+        return part
 
     # Handle structured dictionary parts
     part_type, content = _parse_chat_message_content_mm_part(part)
@@ -855,7 +840,7 @@ def _parse_chat_message_content_part(
         return {'type': 'audio'} if wrap_dicts else None
 
     if part_type == "input_audio":
-        dict_content = cast(Dict[str, str], content)
+        dict_content = cast(InputAudio, content)
         mm_parser.parse_input_audio(dict_content)
         return {'type': 'audio'} if wrap_dicts else None
 
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index ed3bb82bf0aaa..3e09ef1fcbb56 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -1,10 +1,14 @@
+import base64
+from io import BytesIO
+from pathlib import Path
+
 import numpy as np
 import numpy.typing as npt
 
 from vllm.inputs.registry import InputContext
 from vllm.utils import PlaceholderModule
 
-from .base import MultiModalPlugin
+from .base import MediaIO, MultiModalPlugin
 from .inputs import AudioItem, MultiModalData, MultiModalKwargs
 
 try:
@@ -12,6 +16,11 @@ try:
 except ImportError:
     librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
 
+try:
+    import soundfile
+except ImportError:
+    soundfile = PlaceholderModule("soundfile")  # type: ignore[assignment]
+
 
 class AudioPlugin(MultiModalPlugin):
     """Plugin for audio data."""
@@ -39,3 +48,28 @@ def resample_audio(
     target_sr: float,
 ) -> npt.NDArray[np.floating]:
     return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
+
+
+class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
+
+    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
+        return librosa.load(BytesIO(data), sr=None)
+
+    def load_base64(
+        self,
+        media_type: str,
+        data: str,
+    ) -> tuple[npt.NDArray, float]:
+        return self.load_bytes(base64.b64decode(data))
+
+    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
+        return librosa.load(filepath, sr=None)
+
+    def encode_base64(self, media: tuple[npt.NDArray, float]) -> str:
+        audio, sr = media
+
+        with BytesIO() as buffer:
+            soundfile.write(buffer, audio, sr, format="WAV")
+            data = buffer.getvalue()
+
+        return base64.b64encode(data).decode('utf-8')
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 1e5a46946c6c0..10488e24b30cc 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple,
+from pathlib import Path
+from typing import (TYPE_CHECKING, Any, Callable, Generic, NamedTuple,
                     Optional, Sequence, Tuple, Type, TypeVar, Union)
 
 from torch import nn
@@ -118,7 +119,7 @@ class MultiModalPlugin(ABC):
         self,
         model_config: "ModelConfig",
         data: MultiModalData[Any],
-        mm_processor_kwargs: Optional[Dict[str, Any]],
+        mm_processor_kwargs: Optional[dict[str, Any]],
     ) -> MultiModalKwargs:
         """
         Transform the data into a dictionary of model inputs using the
@@ -254,10 +255,10 @@ class MultiModalPlaceholderMap:
     """
 
     class IndexMap(NamedTuple):
-        src: List[int]
-        dest: List[int]
+        src: list[int]
+        dest: list[int]
 
-    src_ranges: List[range]
+    src_ranges: list[range]
     """
     The indices of the multi-modal embeddings that will replace the
     corresponding placeholder embeddings pointed to by ``dest_ranges``.
@@ -268,7 +269,7 @@ class MultiModalPlaceholderMap:
     The total number of flattened multi-modal embeddings.
     """
 
-    dest_ranges: List[range]
+    dest_ranges: list[range]
     """
     The indices of the placeholder embeddings that will be replaced by the
     multimodal embeddings.
@@ -288,7 +289,7 @@ class MultiModalPlaceholderMap:
     @classmethod
     def from_seq_group(
         cls, seq_group: "SequenceGroupMetadata", positions: range
-    ) -> Tuple[Optional[MultiModalDataDict], Dict[str,
+    ) -> Tuple[Optional[MultiModalDataDict], dict[str,
                                                   "MultiModalPlaceholderMap"]]:
         """
         Returns the multi-modal items that intersect with the portion of a
@@ -376,9 +377,9 @@ class MultiModalPlaceholderMap:
     def append_items_from_seq_group(
         self,
         positions: range,
-        multi_modal_items: List[_T],
+        multi_modal_items: list[_T],
         multi_modal_placeholders: Sequence[PlaceholderRange],
-    ) -> List[_T]:
+    ) -> list[_T]:
         """
         Adds the multi-modal items that intersect ```positions`` to this
         placeholder map and returns the intersecting items.
@@ -454,3 +455,22 @@ class MultiModalPlaceholderMap:
 
         return MultiModalPlaceholderMap.IndexMap(src=src_indices,
                                                  dest=dest_indices)
+
+
+class MediaIO(ABC, Generic[_T]):
+
+    @abstractmethod
+    def load_bytes(self, data: bytes) -> _T:
+        raise NotImplementedError
+
+    @abstractmethod
+    def load_base64(self, media_type: str, data: str) -> _T:
+        """
+        List of media types:
+        https://www.iana.org/assignments/media-types/media-types.xhtml
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def load_file(self, filepath: Path) -> _T:
+        raise NotImplementedError
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index c705e1a3d1554..14c79dfadec0c 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,4 +1,7 @@
+import base64
 from functools import lru_cache
+from io import BytesIO
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
@@ -9,7 +12,7 @@ from vllm.logger import init_logger
 from vllm.transformers_utils.processor import get_image_processor
 from vllm.utils import is_list_of
 
-from .base import MultiModalPlugin
+from .base import MediaIO, MultiModalPlugin
 from .inputs import ImageItem, MultiModalData, MultiModalKwargs
 
 if TYPE_CHECKING:
@@ -96,3 +99,39 @@ def rescale_image_size(image: Image.Image,
     if transpose >= 0:
         image = image.transpose(Image.Transpose(transpose))
     return image
+
+
+class ImageMediaIO(MediaIO[Image.Image]):
+
+    def __init__(self, *, image_mode: str = "RGB") -> None:
+        super().__init__()
+
+        self.image_mode = image_mode
+
+    def load_bytes(self, data: bytes) -> Image.Image:
+        image = Image.open(BytesIO(data))
+        image.load()
+        return image.convert(self.image_mode)
+
+    def load_base64(self, media_type: str, data: str) -> Image.Image:
+        return self.load_bytes(base64.b64decode(data))
+
+    def load_file(self, filepath: Path) -> Image.Image:
+        image = Image.open(filepath)
+        image.load()
+        return image.convert(self.image_mode)
+
+    def encode_base64(
+        self,
+        media: Image.Image,
+        *,
+        image_format: str = "JPEG",
+    ) -> str:
+        image = media
+
+        with BytesIO() as buffer:
+            image = image.convert(self.image_mode)
+            image.save(buffer, image_format)
+            data = buffer.getvalue()
+
+        return base64.b64encode(data).decode('utf-8')
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index a49da2bdee972..87b12a6fb33c1 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -1,8 +1,7 @@
-import base64
-import os
 from functools import lru_cache
-from io import BytesIO
-from typing import List, Optional, Tuple, TypeVar, Union
+from pathlib import Path
+from typing import Optional, TypeVar, Union
+from urllib.parse import ParseResult, urlparse
 
 import numpy as np
 import numpy.typing as npt
@@ -10,283 +9,246 @@ import torch
 from PIL import Image
 
 import vllm.envs as envs
-from vllm.connections import global_http_connection
+from vllm.connections import HTTPConnection, global_http_connection
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
-from vllm.utils import PlaceholderModule
 
-from .inputs import MultiModalDataDict, PlaceholderRange
-
-try:
-    import decord
-except ImportError:
-    decord = PlaceholderModule("decord")  # type: ignore[assignment]
-
-try:
-    import librosa
-except ImportError:
-    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
-
-try:
-    import soundfile
-except ImportError:
-    soundfile = PlaceholderModule("soundfile")  # type: ignore[assignment]
+from .audio import AudioMediaIO
+from .base import MediaIO
+from .image import ImageMediaIO
+from .inputs import PlaceholderRange
+from .video import VideoMediaIO
 
 logger = init_logger(__name__)
 
 cached_get_tokenizer = lru_cache(get_tokenizer)
 
-
-def _load_image_from_bytes(b: bytes) -> Image.Image:
-    image = Image.open(BytesIO(b))
-    image.load()
-    return image
+_M = TypeVar("_M")
 
 
-def _is_subpath(image_path: str, allowed_local_media_path: str) -> bool:
-    # Get the common path
-    common_path = os.path.commonpath([
-        os.path.abspath(image_path),
-        os.path.abspath(allowed_local_media_path)
-    ])
-    # Check if the common path is the same as allowed_local_media_path
-    return common_path == os.path.abspath(allowed_local_media_path)
+class MediaConnector:
 
+    def __init__(
+        self,
+        connection: HTTPConnection = global_http_connection,
+        *,
+        allowed_local_media_path: str = "",
+    ) -> None:
+        super().__init__()
 
-def _load_image_from_file(image_url: str,
-                          allowed_local_media_path: str) -> Image.Image:
-    if not allowed_local_media_path:
-        raise ValueError("Invalid 'image_url': Cannot load local files without"
-                         "'--allowed-local-media-path'.")
-    if allowed_local_media_path:
-        if not os.path.exists(allowed_local_media_path):
+        self.connection = connection
+
+        if allowed_local_media_path:
+            allowed_local_media_path_ = Path(allowed_local_media_path)
+
+            if not allowed_local_media_path_.exists():
+                raise ValueError(
+                    "Invalid `--allowed-local-media-path`: The path "
+                    f"{allowed_local_media_path_} does not exist.")
+            if not allowed_local_media_path_.is_dir():
+                raise ValueError(
+                    "Invalid `--allowed-local-media-path`: The path "
+                    f"{allowed_local_media_path_} must be a directory.")
+        else:
+            allowed_local_media_path_ = None
+
+        self.allowed_local_media_path = allowed_local_media_path_
+
+    def _load_data_url(
+        self,
+        url_spec: ParseResult,
+        media_io: MediaIO[_M],
+    ) -> _M:
+        data_spec, data = url_spec.path.split(",", 1)
+        media_type, data_type = data_spec.split(";", 1)
+
+        if data_type != "base64":
+            msg = "Only base64 data URLs are supported for now."
+            raise NotImplementedError(msg)
+
+        return media_io.load_base64(media_type, data)
+
+    def _load_file_url(
+        self,
+        url_spec: ParseResult,
+        media_io: MediaIO[_M],
+    ) -> _M:
+        allowed_local_media_path = self.allowed_local_media_path
+        if allowed_local_media_path is None:
+            raise RuntimeError("Cannot load local files without "
+                               "`--allowed-local-media-path`.")
+
+        filepath = Path(url_spec.path)
+        if allowed_local_media_path not in filepath.resolve().parents:
             raise ValueError(
-                "Invalid '--allowed-local-media-path': "
-                f"The path {allowed_local_media_path} does not exist.")
-        if not os.path.isdir(allowed_local_media_path):
-            raise ValueError(
-                "Invalid '--allowed-local-media-path': "
-                f"The path {allowed_local_media_path} must be a directory.")
+                f"The file path {filepath} must be a subpath "
+                f"of `--allowed-local-media-path` {allowed_local_media_path}.")
 
-    # Only split once and assume the second part is the image path
-    _, image_path = image_url.split("file://", 1)
-    if not _is_subpath(image_path, allowed_local_media_path):
-        raise ValueError(
-            f"Invalid 'image_url': The file path {image_path} must"
-            " be a subpath of '--allowed-local-media-path'"
-            f" '{allowed_local_media_path}'.")
+        return media_io.load_file(filepath)
 
-    image = Image.open(image_path)
-    image.load()
-    return image
+    def load_from_url(
+        self,
+        url: str,
+        media_io: MediaIO[_M],
+        *,
+        fetch_timeout: Optional[int] = None,
+    ) -> _M:
+        url_spec = urlparse(url)
 
+        if url_spec.scheme.startswith("http"):
+            connection = self.connection
+            data = connection.get_bytes(url, timeout=fetch_timeout)
 
-def _load_image_from_data_url(image_url: str) -> Image.Image:
-    # Only split once and assume the second part is the base64 encoded image
-    _, image_base64 = image_url.split(",", 1)
-    return load_image_from_base64(image_base64)
+            return media_io.load_bytes(data)
 
+        if url_spec.scheme == "data":
+            return self._load_data_url(url_spec, media_io)
 
-def fetch_image(image_url: str,
-                *,
-                image_mode: str = "RGB",
-                allowed_local_media_path: str = "") -> Image.Image:
-    """
-    Load a PIL image from a HTTP or base64 data URL.
+        if url_spec.scheme == "file":
+            return self._load_file_url(url_spec, media_io)
 
-    By default, the image is converted into RGB format.
-    """
-    if image_url.startswith('http'):
-        image_raw = global_http_connection.get_bytes(
-            image_url,
-            timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
-        )
-        image = _load_image_from_bytes(image_raw)
+        msg = "The URL must be either a HTTP, data or file URL."
+        raise ValueError(msg)
 
-    elif image_url.startswith('data:image'):
-        image = _load_image_from_data_url(image_url)
-    elif image_url.startswith('file://'):
-        image = _load_image_from_file(image_url, allowed_local_media_path)
-    else:
-        raise ValueError("Invalid 'image_url': A valid 'image_url' must start "
-                         "with either 'data:image', 'file://' or 'http'.")
+    async def load_from_url_async(
+        self,
+        url: str,
+        media_io: MediaIO[_M],
+        *,
+        fetch_timeout: Optional[int] = None,
+    ) -> _M:
+        url_spec = urlparse(url)
 
-    return image.convert(image_mode)
+        if url_spec.scheme.startswith("http"):
+            connection = self.connection
+            data = await connection.async_get_bytes(url, timeout=fetch_timeout)
 
+            return media_io.load_bytes(data)
 
-async def async_fetch_image(image_url: str,
-                            *,
-                            image_mode: str = "RGB",
-                            allowed_local_media_path: str = "") -> Image.Image:
-    """
-    Asynchronously load a PIL image from a HTTP or base64 data URL.
+        if url_spec.scheme == "data":
+            return self._load_data_url(url_spec, media_io)
 
-    By default, the image is converted into RGB format.
-    """
-    if image_url.startswith('http'):
-        image_raw = await global_http_connection.async_get_bytes(
-            image_url,
-            timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
-        )
-        image = _load_image_from_bytes(image_raw)
+        if url_spec.scheme == "file":
+            return self._load_file_url(url_spec, media_io)
 
-    elif image_url.startswith('data:image'):
-        image = _load_image_from_data_url(image_url)
-    elif image_url.startswith('file://'):
-        image = _load_image_from_file(image_url, allowed_local_media_path)
-    else:
-        raise ValueError("Invalid 'image_url': A valid 'image_url' must start "
-                         "with either 'data:image', 'file://' or 'http'.")
+        msg = "The URL must be either a HTTP, data or file URL."
+        raise ValueError(msg)
 
-    return image.convert(image_mode)
+    def fetch_audio(
+        self,
+        audio_url: str,
+    ) -> tuple[np.ndarray, Union[int, float]]:
+        """
+        Load audio from a URL.
+        """
+        audio_io = AudioMediaIO()
 
-
-def _load_video_from_bytes(b: bytes, num_frames: int = 32) -> npt.NDArray:
-    video_path = BytesIO(b)
-    vr = decord.VideoReader(video_path, num_threads=1)
-    total_frame_num = len(vr)
-
-    if total_frame_num > num_frames:
-        uniform_sampled_frames = np.linspace(0,
-                                             total_frame_num - 1,
-                                             num_frames,
-                                             dtype=int)
-        frame_idx = uniform_sampled_frames.tolist()
-    else:
-        frame_idx = [i for i in range(0, total_frame_num)]
-    frames = vr.get_batch(frame_idx).asnumpy()
-
-    return frames
-
-
-def _load_video_from_data_url(video_url: str) -> npt.NDArray:
-    # Only split once and assume the second part is the base64 encoded video
-    _, video_base64 = video_url.split(",", 1)
-
-    if video_url.startswith("data:video/jpeg;"):
-        return np.stack([
-            np.array(load_image_from_base64(frame_base64))
-            for frame_base64 in video_base64.split(",")
-        ])
-
-    return load_video_from_base64(video_base64)
-
-
-def fetch_video(video_url: str, *, num_frames: int = 32) -> npt.NDArray:
-    """
-    Load video from a HTTP or base64 data URL.
-    """
-    if video_url.startswith('http') or video_url.startswith('https'):
-        video_raw = global_http_connection.get_bytes(
-            video_url,
-            timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
-        )
-        video = _load_video_from_bytes(video_raw, num_frames)
-    elif video_url.startswith('data:video'):
-        video = _load_video_from_data_url(video_url)
-    else:
-        raise ValueError("Invalid 'video_url': A valid 'video_url' must start "
-                         "with either 'data:video' or 'http'.")
-    return video
-
-
-async def async_fetch_video(video_url: str,
-                            *,
-                            num_frames: int = 32) -> npt.NDArray:
-    """
-    Asynchronously load video from a HTTP or base64 data URL.
-
-    By default, the image is converted into RGB format.
-    """
-    if video_url.startswith('http') or video_url.startswith('https'):
-        video_raw = await global_http_connection.async_get_bytes(
-            video_url,
-            timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
-        )
-        video = _load_video_from_bytes(video_raw, num_frames)
-    elif video_url.startswith('data:video'):
-        video = _load_video_from_data_url(video_url)
-    else:
-        raise ValueError("Invalid 'video_url': A valid 'video_url' must start "
-                         "with either 'data:video' or 'http'.")
-    return video
-
-
-def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
-    """
-    Load audio from a URL.
-    """
-    if audio_url.startswith("http"):
-        audio_bytes = global_http_connection.get_bytes(
+        return self.load_from_url(
             audio_url,
-            timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
+            audio_io,
+            fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
         )
-    elif audio_url.startswith("data:audio"):
-        _, audio_base64 = audio_url.split(",", 1)
-        audio_bytes = base64.b64decode(audio_base64)
-    else:
-        raise ValueError("Invalid 'audio_url': A valid 'audio_url' must start "
-                         "with either 'data:audio' or 'http'.")
 
-    return librosa.load(BytesIO(audio_bytes), sr=None)
+    async def fetch_audio_async(
+        self,
+        audio_url: str,
+    ) -> tuple[np.ndarray, Union[int, float]]:
+        """
+        Asynchronously fetch audio from a URL.
+        """
+        audio_io = AudioMediaIO()
 
-
-async def async_fetch_audio(
-        audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
-    """
-    Asynchronously fetch audio from a URL.
-    """
-    if audio_url.startswith("http"):
-        audio_bytes = await global_http_connection.async_get_bytes(
+        return await self.load_from_url_async(
             audio_url,
-            timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
+            audio_io,
+            fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
         )
-    elif audio_url.startswith("data:audio"):
-        _, audio_base64 = audio_url.split(",", 1)
-        audio_bytes = base64.b64decode(audio_base64)
-    else:
-        raise ValueError("Invalid 'audio_url': A valid 'audio_url' must start "
-                         "with either 'data:audio' or 'http'.")
 
-    return librosa.load(BytesIO(audio_bytes), sr=None)
-
-
-def get_and_parse_audio(audio_url: str) -> MultiModalDataDict:
-    audio, sr = fetch_audio(audio_url)
-    return {"audio": (audio, sr)}
-
-
-def get_and_parse_image(
+    def fetch_image(
+        self,
         image_url: str,
         *,
-        allowed_local_media_path: str = "") -> MultiModalDataDict:
-    image = fetch_image(image_url,
-                        allowed_local_media_path=allowed_local_media_path)
-    return {"image": image}
+        image_mode: str = "RGB",
+    ) -> Image.Image:
+        """
+        Load a PIL image from a HTTP or base64 data URL.
 
+        By default, the image is converted into RGB format.
+        """
+        image_io = ImageMediaIO(image_mode=image_mode)
 
-def get_and_parse_video(video_url: str) -> MultiModalDataDict:
-    video = fetch_video(video_url)
-    return {"video": video}
+        return self.load_from_url(
+            image_url,
+            image_io,
+            fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
+        )
 
-
-async def async_get_and_parse_audio(audio_url: str) -> MultiModalDataDict:
-    audio, sr = await async_fetch_audio(audio_url)
-    return {"audio": (audio, sr)}
-
-
-async def async_get_and_parse_image(
+    async def fetch_image_async(
+        self,
         image_url: str,
         *,
-        allowed_local_media_path: str = "") -> MultiModalDataDict:
-    image = await async_fetch_image(
-        image_url, allowed_local_media_path=allowed_local_media_path)
-    return {"image": image}
+        image_mode: str = "RGB",
+    ) -> Image.Image:
+        """
+        Asynchronously load a PIL image from a HTTP or base64 data URL.
+
+        By default, the image is converted into RGB format.
+        """
+        image_io = ImageMediaIO(image_mode=image_mode)
+
+        return await self.load_from_url_async(
+            image_url,
+            image_io,
+            fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
+        )
+
+    def fetch_video(
+        self,
+        video_url: str,
+        *,
+        image_mode: str = "RGB",
+        num_frames: int = 32,
+    ) -> npt.NDArray:
+        """
+        Load video from a HTTP or base64 data URL.
+        """
+        image_io = ImageMediaIO(image_mode=image_mode)
+        video_io = VideoMediaIO(image_io, num_frames=num_frames)
+
+        return self.load_from_url(
+            video_url,
+            video_io,
+            fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
+        )
+
+    async def fetch_video_async(
+        self,
+        video_url: str,
+        *,
+        image_mode: str = "RGB",
+        num_frames: int = 32,
+    ) -> npt.NDArray:
+        """
+        Asynchronously load video from a HTTP or base64 data URL.
+
+        By default, the image is converted into RGB format.
+        """
+        image_io = ImageMediaIO(image_mode=image_mode)
+        video_io = VideoMediaIO(image_io, num_frames=num_frames)
+
+        return await self.load_from_url_async(
+            video_url,
+            video_io,
+            fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
+        )
 
 
-async def async_get_and_parse_video(video_url: str) -> MultiModalDataDict:
-    video = await async_fetch_video(video_url)
-    return {"video": video}
+global_media_connector = MediaConnector()
+"""The global :class:`MediaConnector` instance used by vLLM."""
+
+fetch_audio = global_media_connector.fetch_audio
+fetch_image = global_media_connector.fetch_image
+fetch_video = global_media_connector.fetch_video
 
 
 def encode_audio_base64(
@@ -294,10 +256,8 @@ def encode_audio_base64(
     sampling_rate: int,
 ) -> str:
     """Encode audio as base64."""
-    buffered = BytesIO()
-    soundfile.write(buffered, audio, sampling_rate, format="WAV")
-
-    return base64.b64encode(buffered.getvalue()).decode('utf-8')
+    audio_io = AudioMediaIO()
+    return audio_io.encode_base64((audio, sampling_rate))
 
 
 def encode_image_base64(
@@ -311,29 +271,14 @@ def encode_image_base64(
 
     By default, the image is converted into RGB format before being encoded.
     """
-    buffered = BytesIO()
-    image = image.convert(image_mode)
-    image.save(buffered, format)
-    return base64.b64encode(buffered.getvalue()).decode('utf-8')
-
-
-def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
-    """Load image from base64 format."""
-    return _load_image_from_bytes(base64.b64decode(image))
+    image_io = ImageMediaIO(image_mode=image_mode)
+    return image_io.encode_base64(image, image_format=format)
 
 
 def encode_video_base64(frames: npt.NDArray) -> str:
-    base64_frames = []
-    frames_list = [frames[i] for i in range(frames.shape[0])]
-    for frame in frames_list:
-        img_base64 = encode_image_base64(Image.fromarray(frame))
-        base64_frames.append(img_base64)
-    return ",".join(base64_frames)
-
-
-def load_video_from_base64(video: Union[bytes, str]) -> npt.NDArray:
-    """Load video from base64 format."""
-    return _load_video_from_bytes(base64.b64decode(video))
+    image_io = ImageMediaIO()
+    video_io = VideoMediaIO(image_io)
+    return video_io.encode_base64(frames)
 
 
 def resolve_visual_encoder_outputs(
@@ -389,7 +334,7 @@ def repeat_and_pad_token(
     repeat_count: int = 1,
     pad_token_left: Optional[_T] = None,
     pad_token_right: Optional[_T] = None,
-) -> List[_T]:
+) -> list[_T]:
     replacement = [token] * repeat_count
     if pad_token_left is not None:
         replacement = [pad_token_left] + replacement
@@ -402,13 +347,13 @@ def repeat_and_pad_token(
 def repeat_and_pad_placeholder_tokens(
     tokenizer: AnyTokenizer,
     prompt: Optional[str],
-    prompt_token_ids: List[int],
+    prompt_token_ids: list[int],
     *,
     placeholder_token_id: int,
-    repeat_count: Union[int, List[int]],
+    repeat_count: Union[int, list[int]],
     pad_token_left: Optional[int] = None,
     pad_token_right: Optional[int] = None,
-) -> Tuple[Optional[str], List[int], List[PlaceholderRange]]:
+) -> tuple[Optional[str], list[int], list[PlaceholderRange]]:
     if isinstance(repeat_count, int):
         repeat_count = [repeat_count]
 
@@ -450,8 +395,8 @@ def repeat_and_pad_placeholder_tokens(
             new_prompt += prompt_parts[i] + replacement_str
         new_prompt += prompt_parts[-1]
 
-    new_token_ids: List[int] = []
-    placeholder_ranges: List[PlaceholderRange] = []
+    new_token_ids = list[int]()
+    placeholder_ranges = list[PlaceholderRange]()
     placeholder_token_idx = 0
     for i, token in enumerate(prompt_token_ids):
         if token == placeholder_token_id:
@@ -481,7 +426,7 @@ def repeat_and_pad_placeholder_tokens(
 def consecutive_placeholder_ranges(
         num_items: int,
         item_size: int,
-        initial_offset: int = 0) -> List[PlaceholderRange]:
+        initial_offset: int = 0) -> list[PlaceholderRange]:
     """Returns a list of consecutive PlaceholderRanges of a fixed size"""
 
     return [
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index c4be100562703..b7d43c830cc46 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -1,23 +1,32 @@
-from functools import lru_cache
+import base64
+from functools import lru_cache, partial
+from io import BytesIO
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import cv2
 import numpy as np
 import numpy.typing as npt
+from PIL import Image
 
 from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.processor import get_video_processor
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from vllm.utils import is_list_of
+from vllm.utils import PlaceholderModule, is_list_of
 
-from .base import MultiModalData
-from .image import ImagePlugin
+from .base import MediaIO, MultiModalData
+from .image import ImageMediaIO, ImagePlugin
 from .inputs import MultiModalKwargs, VideoItem
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
 
+try:
+    import decord
+except ImportError:
+    decord = PlaceholderModule("decord")  # type: ignore[assignment]
+
 logger = init_logger(__name__)
 
 cached_get_video_processor = lru_cache(get_video_processor)
@@ -107,3 +116,73 @@ def sample_frames_from_video(frames: npt.NDArray,
     frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
     sampled_frames = frames[frame_indices, ...]
     return sampled_frames
+
+
+class VideoMediaIO(MediaIO[npt.NDArray]):
+
+    def __init__(
+        self,
+        image_io: ImageMediaIO,
+        *,
+        num_frames: int = 32,
+    ) -> None:
+        super().__init__()
+
+        self.image_io = image_io
+        self.num_frames = num_frames
+
+    def load_bytes(self, data: bytes) -> npt.NDArray:
+        vr = decord.VideoReader(BytesIO(data), num_threads=1)
+        total_frame_num = len(vr)
+
+        num_frames = self.num_frames
+        if total_frame_num > num_frames:
+            uniform_sampled_frames = np.linspace(0,
+                                                 total_frame_num - 1,
+                                                 num_frames,
+                                                 dtype=int)
+            frame_idx = uniform_sampled_frames.tolist()
+        else:
+            frame_idx = list(range(0, total_frame_num))
+
+        return vr.get_batch(frame_idx).asnumpy()
+
+    def load_base64(self, media_type: str, data: str) -> npt.NDArray:
+        if media_type.lower() == "video/jpeg":
+            load_frame = partial(
+                self.image_io.load_base64,
+                "image/jpeg",
+            )
+
+            return np.stack([
+                np.array(load_frame(frame_data))
+                for frame_data in data.split(",")
+            ])
+
+        return self.load_bytes(base64.b64decode(data))
+
+    def load_file(self, filepath: Path) -> npt.NDArray:
+        with filepath.open("rb") as f:
+            data = f.read()
+
+        return self.load_bytes(data)
+
+    def encode_base64(
+        self,
+        media: npt.NDArray,
+        *,
+        video_format: str = "JPEG",
+    ) -> str:
+        video = media
+
+        if video_format == "JPEG":
+            encode_frame = partial(
+                self.image_io.encode_base64,
+                image_format=video_format,
+            )
+
+            return ",".join(
+                encode_frame(Image.fromarray(frame)) for frame in video)
+
+        msg = "Only JPEG format is supported for now."
+        raise NotImplementedError(msg)

From 5ce4627a7ec4cf4e19ff4be7f030883ef486393f Mon Sep 17 00:00:00 2001
From: Chen1022 <112855051+ccjincong@users.noreply.github.com>
Date: Fri, 27 Dec 2024 21:05:10 +0800
Subject: [PATCH 08/48] [Doc]  Add xgrammar in doc (#11549)

Signed-off-by: ccjincong <chenjincong11@gmail.com>
---
 docs/source/usage/structured_outputs.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/usage/structured_outputs.md b/docs/source/usage/structured_outputs.md
index 3f5d9ffc26278..7292012e36a26 100644
--- a/docs/source/usage/structured_outputs.md
+++ b/docs/source/usage/structured_outputs.md
@@ -2,7 +2,7 @@
 
 # Structured Outputs
 
-vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines) or [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer) as backends for the guided decoding.
+vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines), [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer), or [xgrammar](https://github.com/mlc-ai/xgrammar) as backends for the guided decoding.
 This document shows you some examples of the different options that are available to generate structured outputs.
 
 ## Online Inference (OpenAI API)

From 101418096ffe3c83b6d541e1303b10e9d5e03861 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 28 Dec 2024 01:22:48 +0800
Subject: [PATCH 09/48] [VLM] Support caching in merged multi-modal processor
 (#11396)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/conf.py                           |   3 +-
 .../design/multimodal/multimodal_index.md     |  28 +-
 docs/source/models/supported_models.md        |   3 +-
 .../openai/test_vision_embedding.py           |   4 +-
 .../mm_processor_kwargs/test_qwen2_vl.py      |   2 +-
 .../vision_language/test_models.py            |   4 +-
 tests/multimodal/test_processing.py           | 207 ++++++-
 vllm/inputs/registry.py                       |  22 +-
 vllm/model_executor/models/llava.py           | 182 +++---
 vllm/model_executor/models/phi3v.py           | 107 +++-
 vllm/model_executor/models/qwen.py            |   4 +-
 vllm/model_executor/models/qwen2_audio.py     |  65 ++-
 vllm/model_executor/models/qwen2_vl.py        | 115 ++--
 vllm/model_executor/models/ultravox.py        |  76 ++-
 vllm/multimodal/base.py                       |  44 +-
 vllm/multimodal/inputs.py                     | 438 ++++++++++++++-
 vllm/multimodal/processing.py                 | 518 ++++++++++++------
 vllm/multimodal/registry.py                   |  50 +-
 vllm/transformers_utils/processor.py          |  12 +-
 vllm/utils.py                                 |  27 +-
 20 files changed, 1459 insertions(+), 452 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 1fe0474631140..71394c5302a39 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -191,6 +191,7 @@ def linkcode_resolve(domain, info):
 
 # Mock out external dependencies here, otherwise the autodoc pages may be blank.
 autodoc_mock_imports = [
+    "blake3",
     "compressed_tensors",
     "cpuinfo",
     "cv2",
@@ -207,7 +208,7 @@ autodoc_mock_imports = [
     "tensorizer",
     "pynvml",
     "outlines",
-    "xgrammar,"
+    "xgrammar",
     "librosa",
     "soundfile",
     "gguf",
diff --git a/docs/source/design/multimodal/multimodal_index.md b/docs/source/design/multimodal/multimodal_index.md
index 88af07afc7018..e4f2171e84ff7 100644
--- a/docs/source/design/multimodal/multimodal_index.md
+++ b/docs/source/design/multimodal/multimodal_index.md
@@ -45,31 +45,23 @@ adding_multimodal_plugin
 ### Base Classes
 
 ```{eval-rst}
-.. autodata:: vllm.multimodal.NestedTensors
-```
-
-```{eval-rst}
-.. autodata:: vllm.multimodal.BatchedTensorInputs
-```
-
-```{eval-rst}
-.. autoclass:: vllm.multimodal.MultiModalDataBuiltins
+.. automodule:: vllm.multimodal.base
     :members:
     :show-inheritance:
 ```
 
-```{eval-rst}
-.. autodata:: vllm.multimodal.MultiModalDataDict
-```
+### Input Classes
 
 ```{eval-rst}
-.. autoclass:: vllm.multimodal.MultiModalKwargs
+.. automodule:: vllm.multimodal.inputs
     :members:
     :show-inheritance:
 ```
 
+### Audio Classes
+
 ```{eval-rst}
-.. autoclass:: vllm.multimodal.MultiModalPlugin
+.. automodule:: vllm.multimodal.audio
     :members:
     :show-inheritance:
 ```
@@ -81,3 +73,11 @@ adding_multimodal_plugin
     :members:
     :show-inheritance:
 ```
+
+### Video Classes
+
+```{eval-rst}
+.. automodule:: vllm.multimodal.video
+    :members:
+    :show-inheritance:
+```
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 95add0d71bbab..7acafda50793c 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -755,8 +755,7 @@ vLLM currently only supports adding LoRA to the language backbone of multimodal
 ```
 
 ```{note}
-To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install their GitHub repo ({code}`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`)
-and pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
+To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
 ```
 
 ```{note}
diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py
index 3731b2dcdeae1..c851539c610ec 100644
--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -91,5 +91,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
     assert len(embeddings.data) == 1
     assert len(embeddings.data[0].embedding) == 3072
     assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 765
-    assert embeddings.usage.total_tokens == 765
+    assert embeddings.usage.prompt_tokens == 764
+    assert embeddings.usage.total_tokens == 764
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
index cd8954ffc48c2..5897c04c89e19 100644
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
@@ -30,7 +30,7 @@ def get_max_qwen2_vl_image_tokens():
 
 
 @pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [
-    ({}, 1225),
+    ({}, 16384),
     ({
         MIN_PIXELS: 64**2,
         MAX_PIXELS: 512**2
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 3101d1d2ea831..1a9c1b4ef1be0 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -201,6 +201,7 @@ VLM_TEST_SETTINGS = {
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        marks=[large_gpu_mark(min_gb=48)],
     ),
     "glm4": VLMTestInfo(
         models=["THUDM/glm-4v-9b"],
@@ -212,7 +213,7 @@ VLM_TEST_SETTINGS = {
         dtype="bfloat16",
         get_stop_token_ids=lambda tok: [151329, 151336, 151338],
         patch_hf_runner=model_utils.glm_patch_hf_runner,
-        marks=[large_gpu_mark(min_gb=48)],
+        marks=[large_gpu_mark(min_gb=32)],
     ),
     "h2ovl": VLMTestInfo(
         models = [
@@ -261,6 +262,7 @@ VLM_TEST_SETTINGS = {
         dtype="bfloat16",
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
+        marks=[large_gpu_mark(min_gb=32)],
     ),
     "llava_next": VLMTestInfo(
         models=["llava-hf/llava-v1.6-mistral-7b-hf"],
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index d22d778f81fa8..1b2847ed0f534 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -1,12 +1,20 @@
+from functools import partial
 from typing import cast
 
+import numpy as np
 import pytest
+from PIL import Image
 
-from vllm.multimodal.processing import (PromptReplacement, _PlaceholderInfo,
-                                        find_text_matches, find_token_matches,
-                                        iter_placeholders, iter_token_matches,
+from vllm.config import ModelConfig
+from vllm.inputs import InputProcessingContext
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.processing import (ProcessingCache, PromptReplacement,
+                                        _PlaceholderInfo, find_text_matches,
+                                        find_token_matches, iter_placeholders,
+                                        iter_token_matches,
                                         replace_text_matches,
                                         replace_token_matches)
+from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import full_groupby
 
@@ -457,6 +465,7 @@ def test_find_replace_tokens(
         ),
     ]
 )
+# yapf: enable
 def test_iter_placeholders(
     repl_by_key,
     prompt,
@@ -475,11 +484,199 @@ def test_iter_placeholders(
             prompt_repls,
             prompt,
             # Effectively match all occurrences in the prompt
-            {key: 3 for key in repl_by_key},
-         ))
+            {key: 3
+             for key in repl_by_key},
+        ))
 
     # Only displayed on error
     print("result:", result)
 
     # Manually constructed results
     assert result == expected
+
+
+def _rand_img(rng: np.random.RandomState, min_wh: int, max_wh: int):
+    w, h = rng.randint(min_wh, max_wh, size=(2, ))
+    arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8)
+    return Image.fromarray(arr)
+
+
+def _rand_video(
+    rng: np.random.RandomState,
+    min_frames: int,
+    max_frames: int,
+    min_wh: int,
+    max_wh: int,
+):
+    # Temporary workaround for https://github.com/huggingface/transformers/issues/35412
+    num_frames = rng.randint(min_frames, max_frames)
+    num_frames = (num_frames // 2) * 2
+
+    w, h = rng.randint(min_wh, max_wh, size=(2, ))
+    return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8)
+
+
+def _rand_audio(
+    rng: np.random.RandomState,
+    min_len: int,
+    max_len: int,
+    sr: int,
+):
+    audio_len = rng.randint(min_len, max_len)
+    return rng.rand(audio_len), sr
+
+
+def _test_processing_cache_correctness(
+    model_id: str,
+    modalities: set[str],
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
+):
+    if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3":
+        hf_overrides = {"architectures": ["MantisForConditionalGeneration"]}
+    else:
+        hf_overrides = {}
+
+    model_config = ModelConfig(
+        model_id,
+        task="auto",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=True,
+        seed=0,
+        dtype="float16",
+        revision=None,
+        hf_overrides=hf_overrides,
+    )
+    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
+
+    processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls]
+    ctx = InputProcessingContext(
+        model_config,
+        tokenizer=cached_get_tokenizer(model_config.tokenizer),
+    )
+    # Ensure that it can fit all of the data
+    cache = ProcessingCache(capacity=1 << 30)
+
+    baseline_processor = processor_factory(ctx, cache=None)
+    cached_processor = processor_factory(ctx, cache=cache)
+
+    rng = np.random.RandomState(0)
+
+    input_to_hit = {
+        "image": Image.new("RGB", size=(128, 128)),
+        "video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
+        "audio": (np.zeros((512, )), 16000),
+    }
+    input_factory = {
+        "image":
+        partial(_rand_img, rng, min_wh=128, max_wh=256),
+        "video":
+        partial(_rand_video,
+                rng,
+                min_frames=2,
+                max_frames=8,
+                min_wh=128,
+                max_wh=256),
+        "audio":
+        partial(_rand_audio, rng, min_len=256, max_len=512, sr=16000),
+    }
+    input_max_count = {
+        "image": 3,
+        "video": 3,
+        "audio": 3,
+    }
+
+    for batch_idx in range(num_batches):
+        mm_data = {
+            k:
+            [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
+             for _ in range(rng.randint(input_max_count[k]))]
+            for k in modalities
+        }
+
+        mm_counts = {k: len(vs) for k, vs in mm_data.items()}
+        prompt = baseline_processor._get_dummy_mm_inputs(mm_counts).prompt_text
+
+        # Drop unnecessary keys and test single -> multi conversion
+        if rng.rand() < simplify_rate:
+            for k in list(mm_data.keys()):
+                if not mm_data[k]:
+                    del mm_data[k]
+                elif len(mm_data[k]) == 1:
+                    mm_data[k] = mm_data[k][0]
+
+        baseline_result = baseline_processor.apply(
+            prompt,
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+        cached_result = cached_processor.apply(
+            prompt,
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+
+        assert baseline_result == cached_result, (
+            f"Failed ({batch_idx=}, {mm_data=})")
+
+
+# yapf: disable
+@pytest.mark.parametrize(("model_id", "modalities"), [
+    ("llava-hf/llava-1.5-7b-hf", {"image"}),
+    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image"}),
+    ("mistral-community/pixtral-12b", {"image"}),
+    ("Qwen/Qwen2-VL-2B-Instruct", {"image", "video"}),
+    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio"}),
+    ("fixie-ai/ultravox-v0_3", {"audio"}),
+])
+@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
+@pytest.mark.parametrize("num_batches", [32])
+@pytest.mark.parametrize("simplify_rate", [1.0])
+# yapf: enable
+def test_processing_cache_correctness(
+    model_id: str,
+    modalities: set[str],
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
+):
+    _test_processing_cache_correctness(
+        model_id,
+        modalities,
+        hit_rate=hit_rate,
+        num_batches=num_batches,
+        simplify_rate=simplify_rate,
+    )
+
+
+# yapf: disable
+@pytest.mark.parametrize(("model_id", "modalities"), [
+    ("microsoft/Phi-3-vision-128k-instruct", {"image"}),
+])
+@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
+@pytest.mark.parametrize("num_batches", [32])
+@pytest.mark.parametrize("simplify_rate", [1.0])
+# yapf: enable
+def test_processing_cache_correctness_phi3v(
+    model_id: str,
+    modalities: set[str],
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
+):
+    # HACK - this is an attempted workaround for the following bug
+    # https://github.com/huggingface/transformers/issues/34307
+    from transformers import AutoImageProcessor  # noqa: F401
+    from transformers import AutoProcessor  # noqa: F401
+
+    AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)
+
+    _test_processing_cache_correctness(
+        model_id,
+        modalities,
+        hit_rate=hit_rate,
+        num_batches=num_batches,
+        simplify_rate=simplify_rate,
+    )
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index f3ec9d115c9ba..46346b08e99c2 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -99,6 +99,9 @@ class InputContext:
 
         merged_kwargs = {**base_kwargs, **kwargs}
 
+        if isinstance(typ, type):
+            merged_kwargs["processor_cls"] = typ
+
         hf_processor = cached_get_processor(
             self.model_config.model,
             trust_remote_code=self.model_config.trust_remote_code,
@@ -132,10 +135,13 @@ class InputProcessingContext(InputContext):
     def call_hf_processor(
         self,
         hf_processor: ProcessorMixin,
-        prompt: str,
-        processor_data: Mapping[str, object],
-        inference_kwargs: Mapping[str, object],
+        data: Mapping[str, object],
+        kwargs: Mapping[str, object] = {},
     ) -> BatchFeature:
+        """
+        Call :code:`hf_processor` on the prompt :code:`data`
+        (text, image, audio...) with configurable options :code:`kwargs`.
+        """
         assert callable(hf_processor)
 
         base_kwargs = self.model_config.mm_processor_kwargs
@@ -144,21 +150,15 @@ class InputProcessingContext(InputContext):
 
         merged_kwargs = resolve_mm_processor_kwargs(
             base_kwargs,
-            inference_kwargs,
+            kwargs,
             hf_processor,
             requires_kw_only=False,
             allow_var_kwargs=True,
         )
 
         try:
-            return hf_processor(
-                text=prompt,
-                **processor_data,
-                **merged_kwargs,
-                return_tensors="pt",
-            )
+            return hf_processor(**data, **merged_kwargs, return_tensors="pt")
         except Exception as exc:
-            data = dict(text=prompt, **processor_data)
             msg = (f"Failed to apply {type(hf_processor).__name__} "
                    f"on data={data} with kwargs={merged_kwargs}")
 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 0662d90e79b92..0ecba5a1cae0f 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,5 +1,4 @@
 from functools import cached_property
-from types import MethodType
 from typing import (Iterable, List, Literal, Mapping, Optional, Protocol, Set,
                     Tuple, TypedDict, Union)
 
@@ -7,7 +6,7 @@ import torch
 import torch.nn as nn
 from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig,
                           PixtralVisionConfig, PretrainedConfig,
-                          ProcessorMixin, SiglipVisionConfig)
+                          SiglipVisionConfig)
 from transformers.models.llava import LlavaProcessor
 from transformers.models.pixtral import PixtralProcessor
 
@@ -21,10 +20,12 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import NestedTensors
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems,
+                                    MultiModalFieldConfig, MultiModalInputsV2,
+                                    MultiModalKwargs, NestedTensors)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
-                                        PromptReplacement)
+                                        ProcessorInputs, PromptReplacement,
+                                        full_groupby_modality)
 from vllm.sequence import IntermediateTensors
 
 from .clip import (CLIPVisionModel, dummy_image_for_clip,
@@ -116,36 +117,54 @@ def get_max_llava_image_tokens(ctx: InputContext):
 
 class LlavaMultiModalProcessor(BaseMultiModalProcessor):
 
-    def _patch_pixtral_processor(self, hf_processor: PixtralProcessor):
-        if getattr(hf_processor, "__is_patched__", False):
-            return  # Already patched
-
-        image_processor = hf_processor.image_processor  # type: ignore
-        orig_preprocess = image_processor.preprocess
-
-        def preprocess(__self, *args, **kwargs):
-            hf_inputs = orig_preprocess(*args, **kwargs)
-            hf_inputs["is_pixtral"] = torch.tensor(True)
-            return hf_inputs
-
-        image_processor.preprocess = MethodType(preprocess, image_processor)
-
-        hf_processor.__is_patched__ = True  # type: ignore
-
     def _get_hf_processor(self) -> Union[LlavaProcessor, PixtralProcessor]:
-        hf_processor = self.ctx.get_hf_processor(
-            (LlavaProcessor, PixtralProcessor))
+        return self.ctx.get_hf_processor((LlavaProcessor, PixtralProcessor))
 
-        if isinstance(hf_processor, PixtralProcessor):
-            self._patch_pixtral_processor(hf_processor)
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        processed_outputs = super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+        )
 
-        return hf_processor
+        # NOTE: pixel_values=None for MLlavaProcessor
+        pixel_values = processed_outputs.get("pixel_values")
+        if pixel_values is not None:
+            images = mm_data["images"]
+            assert isinstance(images, list)
+
+            if isinstance(self._get_hf_processor(), PixtralProcessor):
+                # Original output: (1, num_images, C, H, W)
+                # New output: (num_images, C, H, W)
+                assert (isinstance(pixel_values, list)
+                        and len(pixel_values) == 1
+                        and isinstance(pixel_values[0], list)
+                        and len(pixel_values[0]) == len(images))
+
+                processed_outputs["pixel_values"] = pixel_values[0]
+
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
 
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
-        hf_inputs: BatchFeature,
-        mm_processor_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         hf_config = self.ctx.get_hf_config(LlavaConfig)
         image_token_id = hf_config.image_token_index
@@ -200,7 +219,7 @@ class LlavaMultiModalProcessor(BaseMultiModalProcessor):
     ) -> ProcessorInputs:
         hf_config = self.ctx.get_hf_config(LlavaConfig)
         vision_config = hf_config.vision_config
-        num_images = mm_counts["image"]
+        num_images = mm_counts.get("image", 0)
 
         if isinstance(vision_config, CLIPVisionConfig):
             data = dummy_image_for_clip(vision_config, num_images)
@@ -218,7 +237,6 @@ class LlavaMultiModalProcessor(BaseMultiModalProcessor):
         return ProcessorInputs(
             prompt_text=image_token * num_images,
             mm_data=data,
-            mm_processor_kwargs={},
         )
 
 
@@ -379,7 +397,6 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[LlavaImageInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
-        is_pixtral = kwargs.pop("is_pixtral", torch.tensor([False]))
         image_embeds = kwargs.pop("image_embeds", None)
 
         if pixel_values is None and image_embeds is None:
@@ -390,33 +407,6 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
                 raise ValueError("Incorrect type of pixel values. "
                                  f"Got type: {type(pixel_values)}")
 
-            assert isinstance(is_pixtral, torch.Tensor)
-            if is_pixtral.any():
-                images = pixel_values
-
-                def flatten_to_3d_tensors(item):
-                    if isinstance(item, torch.Tensor):
-                        if item.dim() >= 3:
-                            return [t for t in item.view(-1, *item.shape[-3:])]
-                        else:
-                            raise ValueError(
-                                f"Unexpected tensor dimension: {item.dim()}")
-                    elif isinstance(item, list):
-                        return [
-                            t for subitem in item
-                            for t in flatten_to_3d_tensors(subitem)
-                        ]
-                    else:
-                        raise ValueError(f"Unexpected type: {type(item)}")
-
-                # Restructure the batched images into a list of lists of images
-                images = flatten_to_3d_tensors(pixel_values)
-
-                return LlavaImagePixelInputs(
-                    type="pixel_values",
-                    data=images,
-                )
-
             return LlavaImagePixelInputs(
                 type="pixel_values",
                 data=self._validate_pixel_values(
@@ -586,19 +576,71 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
 
 class MantisMultiModalProcessor(LlavaMultiModalProcessor):
 
-    def _get_hf_processor(self) -> ProcessorMixin:
-        try:
-            from mantis.models.mllava import MLlavaProcessor
-        except ModuleNotFoundError as exc:
-            raise ModuleNotFoundError(
-                "You need to `pip install "
-                "git+https://github.com/TIGER-AI-Lab/Mantis.git` "
-                "to use this model") from exc
+    def _get_hf_processor(self):
+        return self.ctx.get_hf_processor(LlavaProcessor)
 
-        processor = MLlavaProcessor.from_pretrained(
-            self.ctx.model_config.tokenizer)
-        assert isinstance(processor, ProcessorMixin)
-        return processor
+    def apply(
+        self,
+        prompt_text: str,
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalInputsV2:
+        hf_config = self.ctx.get_hf_config(LlavaConfig)
+        image_token_id = hf_config.image_token_index
+        max_image_tokens = get_max_llava_image_tokens(self.ctx)
+
+        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+
+        mm_items = self._get_mm_items(mm_data)
+        mm_item_counts = mm_items.get_item_counts()
+        mm_kwargs = result["mm_kwargs"]
+
+        # We reimplement the functionality of MLlavaProcessor from
+        # https://github.com/TIGER-AI-Lab/Mantis.git
+        def get_replacement_mantis(item_idx: int):
+            return "".join([
+                f"(image {item_idx+1}: <Image>",  # 7 tokens
+                "<image>" * max_image_tokens,
+                "</Image>)",  # 3 tokens
+            ])
+
+        mantis_repls = self._bind_prompt_replacements([
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id] * max_image_tokens,
+                replacement=get_replacement_mantis,
+            )
+        ])
+
+        prompt_ids, prompt_text, _ = self._apply_prompt_replacements(
+            result["prompt_token_ids"],
+            mantis_repls,
+            mm_item_counts,
+        )
+
+        unbound_orig_repls = self._get_prompt_replacements(
+            mm_items,
+            hf_processor_mm_kwargs,
+            mm_kwargs,
+        )
+        orig_repls = self._bind_prompt_replacements(unbound_orig_repls)
+
+        all_placeholders = self._find_placeholders(orig_repls, prompt_ids,
+                                                   mm_item_counts)
+        assert len(all_placeholders) == mm_item_counts.get("image", 0)
+
+        mm_placeholders = {
+            modality: [item.to_range() for item in items]
+            for modality, items in full_groupby_modality(all_placeholders)
+        }
+
+        return MultiModalInputsV2(
+            type="multimodal",
+            prompt=prompt_text,
+            prompt_token_ids=prompt_ids,
+            mm_kwargs=mm_kwargs,
+            mm_placeholders=mm_placeholders,
+        )
 
 
 # To use this model, please use
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 4e2e7f5761544..fefa9fd62d1d0 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -12,9 +12,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
-from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
-                    TypedDict, Union)
+from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union
 
 import torch
 import torch.nn as nn
@@ -32,10 +32,14 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import NestedTensors
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems,
+                                    MultiModalFieldConfig, MultiModalInputsV2,
+                                    MultiModalKwargs, NestedTensors,
+                                    PlaceholderRange)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
-                                        PromptReplacement)
+                                        ProcessorInputs, PromptReplacement,
+                                        _BoundPromptReplacement,
+                                        _PlaceholderInfo)
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
 
@@ -306,11 +310,11 @@ def get_max_phi3v_image_tokens(
     *,
     num_crops: Optional[int] = None,
 ) -> int:
-    mm_processor_kwargs = {}
+    hf_processor_mm_kwargs = {}
     if num_crops:
-        mm_processor_kwargs["num_crops"] = num_crops
+        hf_processor_mm_kwargs["num_crops"] = num_crops
 
-    processor = ctx.get_hf_processor(**mm_processor_kwargs)
+    processor = ctx.get_hf_processor(**hf_processor_mm_kwargs)
 
     return processor.calc_num_image_tokens_from_image_size(
         width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
@@ -331,39 +335,50 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor):
 
     def _call_hf_processor(
         self,
-        hf_processor: ProcessorMixin,
         prompt: str,
-        processor_data: Mapping[str, object],
-        mm_processor_kwargs: Mapping[str, object],
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
     ) -> BatchFeature:
         processed_outputs = super()._call_hf_processor(
-            hf_processor,
             prompt=prompt,
-            processor_data=processor_data,
-            mm_processor_kwargs=mm_processor_kwargs,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
         )
 
+        input_ids = processed_outputs["input_ids"]
+        assert isinstance(input_ids, torch.Tensor)
+
         # Phi3v processor has inserted -1, -2 etc as placeholder in prompt_ids,
         # which will cause OverflowError when decoding the prompt_ids.
         # Therefore, we need to do an early replacement here
-        token_ids = processed_outputs['input_ids']
-        token_ids[token_ids < 0] = _IMAGE_TOKEN_ID
-        processed_outputs['input_ids'] = token_ids
+        input_ids.masked_fill_(input_ids < 0, _IMAGE_TOKEN_ID)
 
         return processed_outputs
 
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_sizes=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
+
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
-        hf_inputs: BatchFeature,
-        mm_processor_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         hf_processor = self._get_hf_processor()
         image_tokens: list[str] = hf_processor.img_tokens  # type: ignore
         image_processor = hf_processor.image_processor  # type: ignore
 
-        mm_config = self.ctx.get_mm_config()
-        max_images = mm_config.limit_per_prompt.get("image", 1)
+        tokenizer = self._get_tokenizer()
+        bos_token_id = tokenizer.bos_token_id
+        assert isinstance(bos_token_id, int)
 
         def get_replacement_phi3v(item_idx: int):
             image_size = mm_items.get_image_size(item_idx)
@@ -372,21 +387,44 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor):
                 height=image_size.height,
             )
 
-            return [_IMAGE_TOKEN_ID] * num_tokens
+            return [_IMAGE_TOKEN_ID] * num_tokens + [bos_token_id]
 
         return [
             PromptReplacement(
                 modality="image",
                 target=image_token,
                 replacement=get_replacement_phi3v,
-            ) for image_token in image_tokens[:max_images]
+            ) for image_token in image_tokens[:len(mm_items.images)]
         ]
 
+    def _apply_prompt_replacements(
+        self,
+        token_ids: list[int],
+        prompt_repls: Sequence[_BoundPromptReplacement],
+        mm_item_counts: Mapping[str, int],
+    ) -> tuple[list[int], str, list[_PlaceholderInfo]]:
+        token_ids, text, placeholders = super()._apply_prompt_replacements(
+            token_ids=token_ids,
+            prompt_repls=prompt_repls,
+            mm_item_counts=mm_item_counts,
+        )
+
+        # Keep the behavior in line with HF processor
+        if text.startswith("<s> <|image|>"):
+            text = text.replace("<s> <|image|>", "<s><|image|>", 1)
+            token_ids = [token_ids[0], *token_ids[2:]]
+            placeholders = [
+                _PlaceholderInfo(p.modality, p.start_idx - 1, p.replacement)
+                for p in placeholders
+            ]
+
+        return token_ids, text, placeholders
+
     def _get_dummy_mm_inputs(
         self,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        num_images = mm_counts["image"]
+        num_images = mm_counts.get("image", 0)
 
         data = dummy_image_for_clip(
             CLIP_VIT_LARGE_PATCH14_336_CONFIG,
@@ -401,9 +439,28 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor):
         return ProcessorInputs(
             prompt_text="".join(image_tokens[:num_images]),
             mm_data=data,
-            mm_processor_kwargs={},
         )
 
+    def apply(
+        self,
+        prompt_text: str,
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalInputsV2:
+        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+
+        # Only <|image|> tokens should be considered as placeholders,
+        # so we ignore the trailing bos_token_id
+        result["mm_placeholders"] = {
+            modality: [
+                PlaceholderRange(offset=p["offset"], length=p["length"] - 1)
+                for p in ps
+            ]
+            for modality, ps in result["mm_placeholders"].items()
+        }
+
+        return result
+
 
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens)
 @MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor)
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 63d1374ab4092..baf955f6b515d 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -225,7 +225,7 @@ class VisualAttentionBlock(nn.Module):
         d_model: int,
         n_head: int,
         mlp_ratio: float = 4.0,
-        norm_layer: Callable = nn.LayerNorm,
+        norm_layer: Callable[[int], nn.Module] = nn.LayerNorm,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -266,7 +266,7 @@ class TransformerBlock(nn.Module):
         layers: int,
         heads: int,
         mlp_ratio: float = 4.0,
-        norm_layer: Callable = nn.LayerNorm,
+        norm_layer: Callable[[int], nn.Module] = nn.LayerNorm,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 6259166a7fc57..25a351bd9c656 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -26,7 +26,7 @@ from typing import (Any, Iterable, List, Mapping, Optional, Set, Tuple,
 import numpy as np
 import torch
 import torch.nn as nn
-from transformers import BatchFeature, ProcessorMixin
+from transformers import BatchFeature
 from transformers.models.qwen2_audio import (Qwen2AudioConfig,
                                              Qwen2AudioEncoder,
                                              Qwen2AudioProcessor)
@@ -38,10 +38,10 @@ from vllm.inputs import InputContext
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import NestedTensors
+from vllm.multimodal.inputs import (MultiModalDataItems, MultiModalFieldConfig,
+                                    MultiModalKwargs, NestedTensors)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
-                                        PromptReplacement)
+                                        ProcessorInputs, PromptReplacement)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsMultiModal, SupportsPP
@@ -73,7 +73,7 @@ class Qwen2AudioMultiModalProjector(nn.Module):
 
 
 # From Qwen2AudioEncoder._get_feat_extract_output_lengths
-def _get_feat_extract_output_lengths(input_lengths: torch.LongTensor):
+def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
     feat_lengths = (input_lengths - 1) // 2 + 1
     output_lengths = (feat_lengths - 2) // 2 + 1
     return feat_lengths, output_lengths
@@ -88,13 +88,18 @@ def get_max_qwen2_audio_audio_tokens(ctx: InputContext) -> int:
 
 class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor):
 
-    def _get_hf_processor(self) -> Qwen2AudioProcessor:
+    def _get_hf_processor(
+        self,
+        *,
+        # Ignored in initialization
+        sampling_rate: Optional[int] = None,
+    ) -> Qwen2AudioProcessor:
         return self.ctx.get_hf_processor(Qwen2AudioProcessor)
 
     def _get_feature_extractor(self) -> WhisperFeatureExtractor:
         return self._get_hf_processor().feature_extractor  # type: ignore
 
-    def _get_processor_data(
+    def _get_hf_mm_data(
         self,
         mm_items: MultiModalDataItems,
     ) -> tuple[dict[str, Any], dict[str, Any]]:
@@ -102,50 +107,61 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor):
         feature_extractor = self._get_feature_extractor()
         mm_items.resample_audios(feature_extractor.sampling_rate)
 
-        return super()._get_processor_data(mm_items)
+        return super()._get_hf_mm_data(mm_items)
 
     def _call_hf_processor(
         self,
-        hf_processor: ProcessorMixin,
         prompt: str,
-        processor_data: Mapping[str, object],
-        mm_processor_kwargs: Mapping[str, object],
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
     ) -> BatchFeature:
-        processor_data = dict(processor_data)
-        audios = processor_data.pop("audios", [])
+        mm_data = dict(mm_data)
+        audios = mm_data.pop("audios", [])
 
         if audios:
-            processor_data["audios"] = audios
+            mm_data["audios"] = audios
 
             feature_extractor = self._get_feature_extractor()
-            mm_processor_kwargs = dict(
-                **mm_processor_kwargs,
+            mm_kwargs = dict(
+                **mm_kwargs,
                 sampling_rate=feature_extractor.sampling_rate,
             )
         else:
             # NOTE: WhisperFeatureExtractor cannot handle empty list of audios
             pass
 
-        return super()._call_hf_processor(
-            hf_processor,
+        processed_outputs = super()._call_hf_processor(
             prompt=prompt,
-            processor_data=processor_data,
-            mm_processor_kwargs=mm_processor_kwargs,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+        )
+
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            input_features=MultiModalFieldConfig.batched("audio"),
+            feature_attention_mask=MultiModalFieldConfig.batched("audio"),
         )
 
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
-        hf_inputs: BatchFeature,
-        mm_processor_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         hf_config = self.ctx.get_hf_config(Qwen2AudioConfig)
         placeholder = hf_config.audio_token_index
 
-        feature_attention_mask = hf_inputs.get("feature_attention_mask")
+        feature_attention_mask = out_mm_kwargs.get("feature_attention_mask")
         if feature_attention_mask is None:
             audio_output_lengths = []
         else:
+            assert isinstance(feature_attention_mask, torch.Tensor)
             _, audio_output_lengths = _get_feat_extract_output_lengths(
                 feature_attention_mask.sum(-1))
 
@@ -168,14 +184,13 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor):
         sampling_rate = feature_extractor.sampling_rate
         audio_len = feature_extractor.chunk_length * sampling_rate
 
-        audio_count = mm_counts["audio"]
+        audio_count = mm_counts.get("audio", 0)
         audio = np.zeros(audio_len)
         data = {"audio": [audio] * audio_count}
 
         return ProcessorInputs(
             prompt_text="<|AUDIO|>" * audio_count,
             mm_data=data,
-            mm_processor_kwargs={},
         )
 
 
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index fb97eb1916002..574845ef5a525 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -22,9 +22,10 @@
 # limitations under the License.
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
 from functools import cached_property, partial
-from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set,
-                    Tuple, Type, TypedDict, Union)
+from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional,
+                    Set, Tuple, Type, TypedDict, Union)
 
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -54,10 +55,11 @@ from vllm.model_executor.layers.quantization.gptq_marlin import (
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalDataDict, NestedTensors
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems,
+                                    MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
-                                        PromptReplacement)
+                                        ProcessorInputs, PromptReplacement)
 from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
@@ -229,9 +231,9 @@ class Qwen2VisionAttention(nn.Module):
 
     def __init__(
         self,
-        embed_dim: Optional[int] = None,
-        num_heads: Optional[int] = None,
-        projection_size: Optional[int] = None,
+        embed_dim: int,
+        num_heads: int,
+        projection_size: int,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ) -> None:
@@ -264,7 +266,7 @@ class Qwen2VisionAttention(nn.Module):
         self,
         x: torch.Tensor,
         cu_seqlens: torch.Tensor,
-        rotary_pos_emb: torch.Tensor = None,
+        rotary_pos_emb: torch.Tensor,
     ) -> torch.Tensor:
         # [s, b, c] --> [s, b, head * 3 * head_dim]
         x, _ = self.qkv(x)
@@ -347,7 +349,7 @@ class Qwen2VisionBlock(nn.Module):
         num_heads: int,
         mlp_ratio: float,
         act_layer: Type[nn.Module] = QuickGELU,
-        norm_layer: Type[nn.Module] = None,
+        norm_layer: Optional[Callable[[int], nn.Module]] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ) -> None:
@@ -384,7 +386,7 @@ class Qwen2VisionPatchEmbed(nn.Module):
         self,
         patch_size: int = 14,
         temporal_patch_size: int = 2,
-        in_chans: int = 3,
+        in_channels: int = 3,
         embed_dim: int = 1152,
     ) -> None:
         super().__init__()
@@ -392,8 +394,8 @@ class Qwen2VisionPatchEmbed(nn.Module):
         self.temporal_patch_size = temporal_patch_size
         self.embed_dim = embed_dim
 
-        kernel_size = [temporal_patch_size, patch_size, patch_size]
-        self.proj = nn.Conv3d(in_chans,
+        kernel_size = (temporal_patch_size, patch_size, patch_size)
+        self.proj = nn.Conv3d(in_channels,
                               embed_dim,
                               kernel_size=kernel_size,
                               stride=kernel_size,
@@ -413,7 +415,7 @@ class Qwen2VisionPatchMerger(nn.Module):
         self,
         d_model: int,
         context_dim: int,
-        norm_layer: Type[nn.Module] = None,
+        norm_layer: Optional[Callable[[int], nn.Module]] = None,
         spatial_merge_size: int = 2,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -489,15 +491,15 @@ class Qwen2VisionTransformer(nn.Module):
     ) -> None:
         super().__init__()
 
-        patch_size: int = vision_config.patch_size
-        temporal_patch_size: int = vision_config.temporal_patch_size
-        spatial_merge_size: int = vision_config.spatial_merge_size
-        in_chans: int = vision_config.in_chans
-        hidden_size: int = vision_config.hidden_size
-        embed_dim: int = vision_config.embed_dim
-        depth: int = vision_config.depth
-        num_heads: int = vision_config.num_heads
-        mlp_ratio: float = vision_config.mlp_ratio
+        patch_size = vision_config.patch_size
+        temporal_patch_size = vision_config.temporal_patch_size
+        spatial_merge_size = vision_config.spatial_merge_size
+        in_channels = vision_config.in_channels
+        hidden_size = vision_config.hidden_size
+        embed_dim = vision_config.embed_dim
+        depth = vision_config.depth
+        num_heads = vision_config.num_heads
+        mlp_ratio = vision_config.mlp_ratio
 
         self.spatial_merge_size = spatial_merge_size
         self.num_heads = num_heads
@@ -506,7 +508,7 @@ class Qwen2VisionTransformer(nn.Module):
         self.patch_embed = Qwen2VisionPatchEmbed(
             patch_size=patch_size,
             temporal_patch_size=temporal_patch_size,
-            in_chans=in_chans,
+            in_channels=in_channels,
             embed_dim=embed_dim,
         )
 
@@ -733,8 +735,12 @@ class Qwen2VLMultiModalDataItems(MultiModalDataItems):
             if k == "video":
                 # Special case since even a single item can be a list
                 multi_data[k] = (  # type: ignore[index]
-                    v if (isinstance(v, (dict, torch.Tensor))  # type: ignore[assignment]
-                          or is_list_of(v, list)) else [v]
+                    v if (
+                        isinstance(v, (dict, torch.Tensor))  # type: ignore[assignment]
+                        or is_list_of(v, list)
+                        or isinstance(v[0], (np.ndarray, torch.Tensor))
+                           and v[0].ndim == 4
+                    ) else [v]
                 )
             elif k in ("image", "audio"):
                 multi_data[k] = (  # type: ignore[index]
@@ -754,6 +760,12 @@ class Qwen2VLMultiModalDataItems(MultiModalDataItems):
             for m, items in self.items()
         }
 
+    def has_embedding_inputs(self) -> bool:
+        return any(
+            isinstance(items, dict) or any(
+                isinstance(item, torch.Tensor) for item in items)
+            for items in self.values())
+
 
 class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
 
@@ -784,7 +796,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
 
         return hf_processor
 
-    def _get_processor_data(
+    def _get_hf_mm_data(
         self,
         mm_items: MultiModalDataItems,
     ) -> tuple[dict[str, Any], dict[str, Any]]:
@@ -805,7 +817,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
                       and v[0].ndim == 2):
                     # Pass through embedding inputs (multi)
                     passthrough_data[f"{k}_embeds"] = v
-                else:
+                elif len(v) > 0:
                     # Map keys to plural form, e.g.: image -> images
                     processor_data[f"{k}s"] = v
             else:
@@ -816,8 +828,8 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
-        hf_inputs: BatchFeature,
-        mm_processor_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         hf_processor = self._get_hf_processor()
         image_processor = _get_image_processor(hf_processor)
@@ -831,7 +843,9 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
         merge_length = image_processor.merge_size**2
 
         def get_replacement_qwen2vl(item_idx: int, modality: str):
-            grid_thw = hf_inputs[f"{modality}_grid_thw"][item_idx]
+            grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx]
+            assert isinstance(grid_thw, torch.Tensor)
+
             num_tokens = grid_thw.prod() // merge_length
             return placeholder[modality] * num_tokens
 
@@ -844,11 +858,40 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
             ) for modality in ("image", "video")
         ]
 
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3)))
+        image_slice_idxs = [0] + image_grid_thw.prod(-1).cumsum_(0).tolist()
+        image_slices = [
+            slice(image_slice_idxs[i], image_slice_idxs[i + 1])
+            for i in range(len(image_grid_thw))
+        ]
+
+        video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3)))
+        video_slice_idxs = [0] + video_grid_thw.prod(-1).cumsum_(0).tolist()
+        video_slices = [
+            slice(video_slice_idxs[i], video_slice_idxs[i + 1])
+            for i in range(len(video_grid_thw))
+        ]
+
+        return dict(
+            pixel_values=MultiModalFieldConfig.flat("image", image_slices),
+            image_embeds=MultiModalFieldConfig.flat("image", image_slices),
+            image_grid_thw=MultiModalFieldConfig.batched("image"),
+            pixel_values_videos=MultiModalFieldConfig.flat(
+                "video", video_slices),
+            video_embeds=MultiModalFieldConfig.flat("video", video_slices),
+            video_grid_thw=MultiModalFieldConfig.batched("video"),
+        )
+
     def _get_dummy_mm_inputs(
         self,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        num_images = mm_counts["image"]
+        num_images = mm_counts.get("image", 0)
         hf_processor = self._get_hf_processor()
         image_token: str = hf_processor.image_token
         image_processor = _get_image_processor(hf_processor)
@@ -869,7 +912,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
         return ProcessorInputs(
             prompt_text=image_token * num_images,
             mm_data=data,
-            mm_processor_kwargs={},
         )
 
 
@@ -950,9 +992,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
         return quant_config
 
-    def _validate_and_reshape_mm_tensor(self,
-                                        mm_input: Union[torch.Tensor,
-                                                        List[torch.Tensor]],
+    def _validate_and_reshape_mm_tensor(self, mm_input: object,
                                         name: str) -> torch.Tensor:
         if not isinstance(mm_input, (torch.Tensor, list)):
             raise ValueError(f"Incorrect type of {name}. "
@@ -962,7 +1002,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                 return mm_input
             if mm_input.ndim != 3:
                 raise ValueError(f"{name} should be 2D or batched 3D tensor. "
-                                 f"Got ndim: {mm_input.ndim}")
+                                 f"Got ndim: {mm_input.ndim} "
+                                 f"(shape={mm_input.shape})")
             return torch.concat(list(mm_input))
         else:
             return torch.concat(mm_input)
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 509ad9e580ddf..7b4aeeec5f403 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -23,10 +23,11 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.loader import DefaultModelLoader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalDataItems, MultiModalFieldConfig,
+                                    MultiModalKwargs, NestedTensors)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
-                                        PromptReplacement)
+                                        ProcessorInputs, PromptReplacement)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 from vllm.utils import is_list_of
@@ -72,11 +73,19 @@ def get_ultravox_max_audio_tokens(ctx: InputContext):
 
 class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
 
+    def _get_hf_processor(
+        self,
+        *,
+        # Ignored in initialization
+        sampling_rate: Optional[int] = None,
+    ) -> ProcessorMixin:
+        return self.ctx.get_hf_processor()
+
     def _get_feature_extractor(self) -> WhisperFeatureExtractor:
         hf_processor = self._get_hf_processor()
         return hf_processor.audio_processor.feature_extractor  # type: ignore
 
-    def _get_processor_data(
+    def _get_hf_mm_data(
         self,
         mm_items: MultiModalDataItems,
     ) -> tuple[dict[str, Any], dict[str, Any]]:
@@ -84,33 +93,41 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
         feature_extractor = self._get_feature_extractor()
         mm_items.resample_audios(feature_extractor.sampling_rate)
 
-        return super()._get_processor_data(mm_items)
+        return super()._get_hf_mm_data(mm_items)
 
     def _call_hf_processor(
         self,
-        hf_processor: ProcessorMixin,
         prompt: str,
-        processor_data: Mapping[str, object],
-        mm_processor_kwargs: Mapping[str, object],
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
     ) -> BatchFeature:
-        processor_data = dict(processor_data)
-        audios = processor_data.pop("audios", [])
+        # Text-only input not supported in composite processor
+        if not mm_data:
+            tokenizer = self._get_tokenizer()
+
+            prompt_ids = tokenizer.encode(
+                prompt,
+                add_special_tokens=False,  # type: ignore
+            )
+            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+
+        mm_data = dict(mm_data)
+        audios = mm_data.pop("audios", [])
 
         if not audios:
             return super()._call_hf_processor(
-                hf_processor,
                 prompt=prompt,
-                processor_data=processor_data,
-                mm_processor_kwargs=mm_processor_kwargs,
+                mm_data=mm_data,
+                mm_kwargs=mm_kwargs,
             )
 
         feature_extractor = self._get_feature_extractor()
-        mm_processor_kwargs = dict(
-            **mm_processor_kwargs,
+        mm_kwargs = dict(
+            **mm_kwargs,
             sampling_rate=feature_extractor.sampling_rate,
         )
 
-        # Already resampled by _get_processor_data
+        # Already resampled by _get_hf_mm_data
         assert is_list_of(audios, np.ndarray)
 
         # Ultravox processor doesn't support multiple inputs,
@@ -119,13 +136,12 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
         shared_outputs = {}
         for audio in audios:
             # NOTE: Ultravox processor accepts "audio" instead of "audios"
-            item_processor_data = dict(**processor_data, audio=audio)
+            item_processor_data = dict(**mm_data, audio=audio)
 
             item_outputs = super()._call_hf_processor(
-                hf_processor,
                 prompt=prompt,
-                processor_data=item_processor_data,
-                mm_processor_kwargs=mm_processor_kwargs,
+                mm_data=item_processor_data,
+                mm_kwargs=mm_kwargs,
             )
 
             audio_features.append(item_outputs.pop("audio_values")[0])
@@ -139,17 +155,28 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
         )
         return BatchFeature(combined_outputs)
 
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            audio_features=MultiModalFieldConfig.batched("audio"),
+            audio_token_len=MultiModalFieldConfig.batched("audio"),
+            audio_embeds=MultiModalFieldConfig.batched("audio"),
+        )
+
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
-        hf_inputs: BatchFeature,
-        mm_processor_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         hf_processor = self._get_hf_processor()
         placeholder = hf_processor.audio_token_replacement  # type: ignore
 
         def get_replacement_ultravox(item_idx: int):
-            audio_token_len = hf_inputs["audio_token_len"][item_idx]
+            audio_token_len = out_mm_kwargs["audio_token_len"][item_idx]
             return placeholder * audio_token_len
 
         return [
@@ -168,14 +195,13 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
         sampling_rate = feature_extractor.sampling_rate
         audio_len = feature_extractor.chunk_length * sampling_rate
 
-        audio_count = mm_counts["audio"]
+        audio_count = mm_counts.get("audio", 0)
         audio = np.zeros(audio_len)
         data = {"audio": [audio] * audio_count}
 
         return ProcessorInputs(
             prompt_text="<|audio|>" * audio_count,
             mm_data=data,
-            mm_processor_kwargs={},
         )
 
 
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 10488e24b30cc..cdda6f8052794 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -297,35 +297,37 @@ class MultiModalPlaceholderMap:
         ``MultiModalPlaceholderMap`` that relates the multi-modal embedding
         vectors to their corresponding placeholders.
 
-        Consider the following scenarios:
+        Examples:
 
-           Prompt: |AAAA BBBB What's in these images?|
-        Positions: |.................................|
+        .. code-block::
 
-            images      = [A, B]
-            src_ranges  = [(0, 4), (4, 8)]
-            dest_ranges = [(0, 4), (5, 9)]
+            Prompt:    |AAAA BBBB What's in these images?|
+            Positions: |.................................|
 
-           Prompt: |AAAA BBBB What's in these images?|
-        Positions: |  .....                          |
+                images      = [A, B]
+                src_ranges  = [(0, 4), (4, 8)]
+                dest_ranges = [(0, 4), (5, 9)]
 
-            images      = [A, B]
-            src_ranges  = [(2, 4), (4, 6)]
-            dest_ranges = [(0, 2), (3, 5)]
+            Prompt:    |AAAA BBBB What's in these images?|
+            Positions: |  .....                          |
 
-           Prompt: |AAAA BBBB What's in these images?|
-        Positions: |     .........                   |
+                images      = [A, B]
+                src_ranges  = [(2, 4), (4, 6)]
+                dest_ranges = [(0, 2), (3, 5)]
 
-            images      = [B]
-            src_ranges  = [(0, 4)]
-            dest_ranges = [(0, 4)]
+            Prompt:    |AAAA BBBB What's in these images?|
+            Positions: |     .........                   |
 
-           Prompt: |AAAA BBBB What's in these images?|
-        Positions: |          .......................|
+                images      = [B]
+                src_ranges  = [(0, 4)]
+                dest_ranges = [(0, 4)]
 
-            images      = []
-            src_ranges  = []
-            dest_ranges = []
+            Prompt:    |AAAA BBBB What's in these images?|
+            Positions: |          .......................|
+
+                images      = []
+                src_ranges  = []
+                dest_ranges = []
         """
         seq_mm_data = seq_group.multi_modal_data
         seq_mm_placeholders = seq_group.multi_modal_placeholders
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 9ecae2c1ca2bf..1fbda6e0b8750 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -1,12 +1,16 @@
+from abc import ABC, abstractmethod
 from collections import UserDict, defaultdict
-from typing import (Any, Dict, List, Literal, Mapping, Sequence, Tuple,
-                    TypedDict, TypeVar, Union, cast, final)
+from collections.abc import Mapping, Sequence
+from dataclasses import dataclass
+from typing import (Any, Literal, NamedTuple, TypedDict, TypeVar, Union, cast,
+                    final)
 
 import numpy as np
 import torch
 import torch.types
 from PIL.Image import Image
-from typing_extensions import NotRequired, TypeAlias
+from transformers import BatchFeature
+from typing_extensions import NotRequired, TypeAlias, assert_never
 
 from vllm.utils import JSONTree, is_list_of, json_map_leaves
 
@@ -44,7 +48,7 @@ item, which can be passed to a HuggingFace :code:`AudioProcessor`.
 """
 # yapf: enable
 
-MultiModalData: TypeAlias = Union[_T, List[_T]]
+MultiModalData: TypeAlias = Union[_T, list[_T]]
 """
 Either a single data item, or a list of data items.
 
@@ -79,13 +83,135 @@ Note:
 """
 
 
+class ImageSize(NamedTuple):
+    width: int
+    height: int
+
+
+class MultiModalDataItems(UserDict[str, list[Any]]):
+    """
+    As :class:`MultiModalDataDict`, but normalized such that each entry
+    corresponds to a list.
+    """
+
+    @staticmethod
+    def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems":
+        """
+        Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
+        """
+        multi_data = MultiModalDataItems()
+
+        for k, v in data.items():
+            # TODO: Make a separate modality for embedding inputs
+            # to avoid confusion
+            # yapf: disable
+            if k == "video":
+                # Special case since even a single item can be a list
+                multi_data[k] = (  # type: ignore[index]
+                    v if (
+                        isinstance(v, torch.Tensor)
+                        or is_list_of(v, list)
+                        or isinstance(v[0], (np.ndarray, torch.Tensor))
+                           and v[0].ndim == 4
+                    ) else [v]
+                )
+            elif k in ("image", "audio"):
+                multi_data[k] = (  # type: ignore[index]
+                    v if isinstance(v, (torch.Tensor, list)) else [v]
+                )
+            else:
+                multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
+            # yapf: enable
+
+        return multi_data
+
+    # NOTE: When a field (e.g. `images`) doesn't exist, directly appending to
+    # `self.images` doesn't update this dictionary, which may be confusing
+    # We annotate the getter methods as `Sequence` to prevent others from
+    # trying to update the list in this way
+    @property
+    def images(self) -> Sequence[ImageItem]:
+        return self.get("image", [])
+
+    @property
+    def videos(self) -> Sequence[VideoItem]:
+        return self.get("video", [])
+
+    @property
+    def audios(self) -> Sequence[AudioItem]:
+        return self.get("audio", [])
+
+    def get_item_counts(self) -> Mapping[str, int]:
+        return {m: len(items) for m, items in self.items()}
+
+    def has_embedding_inputs(self) -> bool:
+        return any(
+            any(isinstance(item, torch.Tensor) for item in items)
+            for items in self.values())
+
+    def get_image_size(self, item_idx: int) -> ImageSize:
+        image = self.images[item_idx]
+
+        if isinstance(image, Image):
+            return ImageSize(*image.size)
+        if isinstance(image, (np.ndarray, torch.Tensor)):
+            _, h, w = image.shape
+            return ImageSize(w, h)
+
+        assert_never(image)
+
+    def get_audio_with_sr(
+        self,
+        item_idx: int,
+        *,
+        default_sr: float,
+    ) -> tuple[np.ndarray, float]:
+        audio = self.audios[item_idx]
+
+        if isinstance(audio, tuple):
+            return audio
+        if isinstance(audio, list):
+            return np.array(audio), default_sr
+        if isinstance(audio, np.ndarray):
+            return audio, default_sr
+
+        assert_never(audio)
+
+    def resample_audios(self, new_sr: float, *, drop_sr: bool = True) -> None:
+        """
+        If :code:`drop_sr=True`, the audio items in this dictionary are updated
+        to be NumPy arrays which implicitly means that their sampling rate is
+        the same as the model's expected sampling rate; otherwise, they remain
+        as :code:`(audio, new_sr)` tuples.
+        """
+        # Avoid circular import
+        from .audio import resample_audio
+
+        if not self.audios:
+            return
+
+        new_audios = []
+        for item_idx in range(len(self.audios)):
+            audio, sr = self.get_audio_with_sr(item_idx, default_sr=new_sr)
+            audio = resample_audio(audio, orig_sr=sr, target_sr=new_sr)
+
+            new_audios.append(audio if drop_sr else (audio, new_sr))
+
+        self["audio"] = new_audios
+
+
 class PlaceholderRange(TypedDict):
     """
     Placeholder location information for multi-modal data.
 
-    For example:
-        Prompt: AAAA BBBB What is in these images?
+    Example:
+
+        Prompt: :code:`AAAA BBBB What is in these images?`
+
         Images A and B will have:
+
+        .. code-block::
+
             A: { "offset": 0, "length": 4 }
             B: { "offset": 5, "length": 4 }
     """
@@ -97,25 +223,256 @@ class PlaceholderRange(TypedDict):
     """The length of the placeholder."""
 
 
-NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor,
-                      Tuple[torch.Tensor, ...]]
+NestedTensors = Union[list["NestedTensors"], list[torch.Tensor], torch.Tensor,
+                      tuple[torch.Tensor, ...]]
 """
 Uses a list instead of a tensor if the dimensions of each element do not match.
 """
 
-BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors]
+
+def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
+    """Equality check between :data:`NestedTensors` objects."""
+    if isinstance(a, torch.Tensor):
+        return isinstance(b, torch.Tensor) and bool((a == b).all().item())
+    elif isinstance(b, torch.Tensor):
+        return isinstance(a, torch.Tensor) and bool((b == a).all().item())
+
+    if isinstance(a, list):
+        return (isinstance(b, list)
+                and all(nested_tensors_equal(a_, b_) for a_, b_ in zip(a, b)))
+    if isinstance(b, list):
+        return (isinstance(a, list)
+                and all(nested_tensors_equal(b_, a_) for b_, a_ in zip(b, a)))
+
+    # Both a and b are scalars
+    return a == b
+
+
+BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors]
 """
 A dictionary containing nested tensors which have been batched via
 :meth:`MultiModalKwargs.batch`.
 """
 
 
+@dataclass(frozen=True)
+class MultiModalFieldItem:
+    """
+    Contains metadata and data in :class:`MultiModalKwargs`
+    corresponding to a data item in :class:`MultiModalDataItems`.
+    """
+    field: "BaseMultiModalField"
+    data: NestedTensors
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, self.__class__):
+            return False
+
+        return (self.field == other.field
+                and nested_tensors_equal(self.data, other.data))
+
+
+@dataclass(frozen=True)
+class BaseMultiModalField(ABC):
+    """Abstract base class for a field in :class:`MultiModalKwargs`."""
+    key: str
+    modality: str
+
+    @abstractmethod
+    def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
+        raise NotImplementedError
+
+    def _build_item(self, data: NestedTensors) -> MultiModalFieldItem:
+        return MultiModalFieldItem(self, data)
+
+    def reduce(self, batch: list[MultiModalFieldItem]) -> MultiModalFieldItem:
+        """Merge multiple instances of :class:`MultiModalFieldItem` together."""
+        fields = [item.field for item in batch]
+        if len(set(fields)) > 1:
+            raise ValueError(f"Cannot merge different {fields=}")
+
+        data = self._reduce_data([item.data for item in batch])
+
+        return self._build_item(data)
+
+
+@dataclass(frozen=True)
+class MultiModalBatchedField(BaseMultiModalField):
+    """
+    A :class:`BaseMultiModalField` implementation where an item is obtained by
+    directly indexing into the first dimension of the underlying data.
+    """
+
+    def build_items(self, batch: NestedTensors) -> list[MultiModalFieldItem]:
+        return [self._build_item(item) for item in batch]
+
+    def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
+        if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"):
+            first_shape = batch[0].shape
+            if all(item.shape == first_shape for item in batch):
+                return torch.stack(batch)
+
+        return batch
+
+
+@dataclass(frozen=True)
+class MultiModalFlatField(BaseMultiModalField):
+    """
+    A :class:`BaseMultiModalField` implementation where an item is obtained by
+    slicing along the first dimension of the underlying data.
+    """
+
+    def build_items(
+        self,
+        batch: NestedTensors,
+        slices: Sequence[slice],
+    ) -> list[MultiModalFieldItem]:
+        return [self._build_item(batch[slice_]) for slice_ in slices]
+
+    def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
+        if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"):
+            first_shape = batch[0].shape
+            if all(item.shape[1:] == first_shape[1:] for item in batch):
+                return torch.concat(batch)
+
+        return [elem for item in batch for elem in item]
+
+
+class MultiModalFieldConfig:
+
+    @staticmethod
+    def batched(modality: str):
+        return MultiModalFieldConfig(
+            field_cls=MultiModalBatchedField,
+            modality=modality,
+        )
+
+    @staticmethod
+    def flat(modality: str, slices: Sequence[slice]):
+        return MultiModalFieldConfig(
+            field_cls=MultiModalFlatField,
+            modality=modality,
+            slices=slices,
+        )
+
+    def __init__(
+        self,
+        field_cls: type[BaseMultiModalField],
+        modality: str,
+        **field_config: Any,
+    ) -> None:
+        super().__init__()
+
+        self._field_cls = field_cls
+        self._modality = modality
+        self._field_config = field_config
+
+    def build_items(
+        self,
+        key: str,
+        batch: NestedTensors,
+    ) -> list[MultiModalFieldItem]:
+        field = self._field_cls(key=key, modality=self._modality)
+        return field.build_items(batch, **self._field_config)  # type: ignore
+
+
 class MultiModalKwargs(UserDict[str, NestedTensors]):
     """
     A dictionary that represents the keyword arguments to
     :meth:`~torch.nn.Module.forward`.
+
+    The metadata :code:`items_by_key` defines how to split batched keyword
+    arguments corresponding to each data item in :class:`MultiModalDataItems`:
+
+    - For a keyword argument, we can access the :code:`i` th item in the batch
+      via :code:`items_by_key[key][i]`.
+    - We can gather the keyword arguments belonging to a modality by finding
+      the keys with items that belong to that modality, then accessing
+      the :code:`i` th item in the batch for each such key.
+
+    Example:
+
+        .. code-block:: python
+
+            # All items belong to the "image" modality
+            items_by_key={
+                "pixel_values": [a, b, c, d],  # "image" modality
+                "image_grid_thw": [e, f, g, h],  # "image" modality
+                "pixel_values_video": [h, i, j],  # "video" modality
+                "video_grid_thw": [k, l, m],  # "video" modality
+            }
+
+        - The keyword arguments belonging to the first image are
+          :code:`{"pixel_values": a, "image_grid_thw": e}`.
+        - The keyword arguments belonging to the second video are
+          :code:`{"pixel_values_video": i, "video_grid_thw": l}`.
     """
 
+    @staticmethod
+    def from_hf_inputs(
+        hf_inputs: BatchFeature,
+        config_by_key: Mapping[str, MultiModalFieldConfig],
+        *,
+        enable_sanity_checks: bool = False,
+    ):
+        # NOTE: This skips fields in `hf_inputs` that are not in `config_by_key`
+        # We assume that those fields are not used in vLLM
+        items_by_key = {
+            key: config.build_items(key, batch)
+            for key, config in config_by_key.items()
+            if (batch := hf_inputs.get(key)) is not None
+        }
+
+        return MultiModalKwargs.from_items_by_key(
+            items_by_key,
+            enable_sanity_checks=enable_sanity_checks,
+        )
+
+    @staticmethod
+    def from_items_by_key(
+        items_by_key: Mapping[str, list[MultiModalFieldItem]],
+        *,
+        enable_sanity_checks: bool = False,
+    ) -> "MultiModalKwargs":
+        data = {
+            key: items[0].field.reduce(items).data
+            for key, items in items_by_key.items()
+        }
+
+        return MultiModalKwargs(data,
+                                items_by_key=items_by_key,
+                                enable_sanity_checks=enable_sanity_checks)
+
+    def __init__(
+        self,
+        data: Mapping[str, NestedTensors],
+        *,
+        items_by_key: Mapping[str, list[MultiModalFieldItem]] = {},
+        enable_sanity_checks: bool = False,
+    ) -> None:
+        super().__init__(data)
+
+        # Shallow copy to avoid footgun in case a defaultdict is passed in
+        self._items_by_key = dict(items_by_key)
+
+        keys_by_modality = defaultdict[str, set[str]](set)
+        for key, items in items_by_key.items():
+            for item in items:
+                keys_by_modality[item.field.modality].add(key)
+
+        self._keys_by_modality = dict(keys_by_modality)
+
+        if enable_sanity_checks:
+            for modality, keys in keys_by_modality.items():
+                items_in_modality = {k: items_by_key[k] for k in keys}
+                batch_sizes = {k: len(v) for k, v in items_in_modality.items()}
+                batch_size = next(iter(batch_sizes.values()), 0)
+                assert all(bs == batch_size
+                           for bs in batch_sizes.values()), dict(
+                               modality=modality,
+                               batch_sizes=batch_sizes,
+                               items_by_key=items_by_key)
+
     @staticmethod
     def _try_stack(nested_tensors: NestedTensors) -> NestedTensors:
         """
@@ -139,7 +496,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
             # Only tensors (not lists) can be stacked.
             return stacked
 
-        tensors_ = cast(List[torch.Tensor], stacked)
+        tensors_ = cast(list[torch.Tensor], stacked)
         if any(t.shape != tensors_[0].shape for t in tensors_):
             # The tensors have incompatible shapes and can't be stacked.
             return tensors_
@@ -147,7 +504,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
         return torch.stack(tensors_)
 
     @staticmethod
-    def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs:
+    def batch(inputs_list: list["MultiModalKwargs"]) -> BatchedTensorInputs:
         """
         Batch multiple inputs together into a dictionary.
 
@@ -162,7 +519,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
 
         # We need to consider the case where each item in the batch
         # contains different modalities (i.e. different keys).
-        item_lists: Dict[str, List[NestedTensors]] = defaultdict(list)
+        item_lists = defaultdict[str, list[NestedTensors]](list)
 
         for inputs in inputs_list:
             for k, v in inputs.items():
@@ -188,6 +545,57 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
 
         return cast(BatchedTensorInputs, json_mapped)
 
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, self.__class__):
+            return False
+        if self._items_by_key != other._items_by_key:
+            return False
+
+        ks = self.keys()
+        return (ks == other.keys()
+                and all(nested_tensors_equal(self[k], other[k]) for k in ks))
+
+    def get_item(self, key: str, item_index: int) -> MultiModalFieldItem:
+        return self._items_by_key[key][item_index]
+
+    def get_items_by_modality(
+        self,
+        modality: str,
+        item_index: int,
+    ) -> Mapping[str, MultiModalFieldItem]:
+        """
+        Get the keyword arguments corresponding to an item identified by
+        its modality and index.
+        """
+        keys_to_gather = self._keys_by_modality[modality]
+
+        return {
+            key: self.get_item(key, item_index)
+            for key in keys_to_gather if key in self
+        }
+
+    @staticmethod
+    def from_items_by_modality(
+        items_by_modality: Mapping[str, list[Mapping[str,
+                                                     MultiModalFieldItem]]],
+        *,
+        enable_sanity_checks: bool = False,
+    ) -> "MultiModalKwargs":
+        """
+        Construct a new :class:`MultiModalKwargs` from multiple items returned
+        by :meth:`get_fields_by_modality`.
+        """
+        items_by_key = defaultdict[str, list[MultiModalFieldItem]](list)
+        for fields in items_by_modality.values():
+            for field in fields:
+                for k, v in field.items():
+                    items_by_key[k].append(v)
+
+        return MultiModalKwargs.from_items_by_key(
+            items_by_key,
+            enable_sanity_checks=enable_sanity_checks,
+        )
+
 
 MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]]
 """
@@ -207,16 +615,16 @@ class MultiModalInputsV2(TypedDict):
     prompt: str
     """The processed prompt text."""
 
-    prompt_token_ids: List[int]
+    prompt_token_ids: list[int]
     """The processed token IDs which includes placeholder tokens."""
 
-    token_type_ids: NotRequired[List[int]]
+    token_type_ids: NotRequired[list[int]]
     """The token type IDs of the prompt."""
 
     mm_kwargs: MultiModalKwargs
     """Keyword arguments to be directly passed to the model after batching."""
 
-    mm_hashes: NotRequired[List[str]]
+    mm_hashes: NotRequired[list[str]]
     """The hashes of the multi-modal data."""
 
     mm_placeholders: MultiModalPlaceholderDict
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 6baf19d675d50..3ece0762e3228 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1,6 +1,6 @@
+import pickle
 import re
 from abc import ABC, abstractmethod
-from collections import UserDict
 from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence
 from dataclasses import dataclass, field
 from functools import lru_cache
@@ -8,19 +8,18 @@ from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union
 
 import numpy as np
 import torch
+from blake3 import blake3
 from PIL.Image import Image
 from transformers import BatchFeature, ProcessorMixin
-from typing_extensions import assert_never
 
 from vllm.inputs import DummyData, InputProcessingContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import flatten_2d_lists, full_groupby, is_list_of
+from vllm.utils import LRUCache, flatten_2d_lists, full_groupby, is_list_of
 
-from .audio import resample_audio
-from .inputs import (AudioItem, ImageItem, MultiModalDataDict,
-                     MultiModalInputsV2, MultiModalKwargs, PlaceholderRange,
-                     VideoItem)
+from .inputs import (MultiModalDataDict, MultiModalDataItems,
+                     MultiModalFieldConfig, MultiModalFieldItem,
+                     MultiModalInputsV2, MultiModalKwargs, PlaceholderRange)
 
 logger = init_logger(__name__)
 
@@ -201,111 +200,6 @@ class _BoundPromptReplacement:
         return bound_replacement
 
 
-class ImageSize(NamedTuple):
-    width: int
-    height: int
-
-
-class MultiModalDataItems(UserDict[str, list[Any]]):
-    """
-    As :class:`MultiModalDataDict`, but normalized such that each entry
-    corresponds to a list.
-    """
-
-    @staticmethod
-    def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems":
-        """
-        Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
-        """
-        multi_data = MultiModalDataItems()
-
-        for k, v in data.items():
-            # TODO: Make a separate modality for embedding inputs
-            # to avoid confusion
-            # yapf: disable
-            if k == "video":
-                # Special case since even a single item can be a list
-                multi_data[k] = (  # type: ignore[index]
-                    v if (isinstance(v, torch.Tensor)
-                          or is_list_of(v, list)) else [v]
-                )
-            elif k in ("image", "audio"):
-                multi_data[k] = (  # type: ignore[index]
-                    v if isinstance(v, (torch.Tensor, list)) else [v]
-                )
-            else:
-                multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
-            # yapf: enable
-
-        return multi_data
-
-    # NOTE: When a field (e.g. `images`) doesn't exist, directly appending to
-    # `self.images` doesn't update this dictionary, which may be confusing
-    # We annotate the getter methods as `Sequence` to prevent others from
-    # trying to update the list in this way
-    @property
-    def images(self) -> Sequence[ImageItem]:
-        return self.get("image", [])
-
-    @property
-    def videos(self) -> Sequence[VideoItem]:
-        return self.get("video", [])
-
-    @property
-    def audios(self) -> Sequence[AudioItem]:
-        return self.get("audio", [])
-
-    def get_item_counts(self) -> Mapping[str, int]:
-        return {m: len(items) for m, items in self.items()}
-
-    def get_image_size(self, item_idx: int) -> ImageSize:
-        image = self.images[item_idx]
-
-        if isinstance(image, Image):
-            return ImageSize(*image.size)
-        if isinstance(image, (np.ndarray, torch.Tensor)):
-            _, h, w = image.shape
-            return ImageSize(w, h)
-
-        assert_never(image)
-
-    def get_audio_with_sr(
-        self,
-        item_idx: int,
-        *,
-        default_sr: float,
-    ) -> tuple[np.ndarray, float]:
-        audio = self.audios[item_idx]
-
-        if isinstance(audio, tuple):
-            return audio
-        if isinstance(audio, list):
-            return np.array(audio), default_sr
-        if isinstance(audio, np.ndarray):
-            return audio, default_sr
-
-        assert_never(audio)
-
-    def resample_audios(self, new_sr: float, *, drop_sr: bool = True) -> None:
-        """
-        If :code:`drop_sr=True`, the audio items in this dictionary are updated
-        to be NumPy arrays which implicitly means that their sampling rate is
-        the same as the model's expected sampling rate; otherwise, they remain
-        as :code:`(audio, new_sr)` tuples.
-        """
-        if not self.audios:
-            return
-
-        new_audios = []
-        for item_idx in range(len(self.audios)):
-            audio, sr = self.get_audio_with_sr(item_idx, default_sr=new_sr)
-            audio = resample_audio(audio, orig_sr=sr, target_sr=new_sr)
-
-            new_audios.append(audio if drop_sr else (audio, new_sr))
-
-        self["audio"] = new_audios
-
-
 class _TokenMatch(NamedTuple):
     start_idx: int
     end_idx: int
@@ -583,11 +477,124 @@ def iter_placeholders(
             )
 
 
-class ProcessorInputs(NamedTuple):
-    """Keyword arguments to :meth:`BaseMultiModalProcessor`"""
+@dataclass
+class ProcessorInputs:
+    """Keyword arguments to :meth:`BaseMultiModalProcessor`."""
     prompt_text: str
     mm_data: MultiModalDataDict
-    mm_processor_kwargs: Mapping[str, object]
+    hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
+
+
+class ProcessingCache:
+
+    def __init__(self, capacity: int) -> None:
+        super().__init__()
+
+        # DEBUG: Set to None to disable
+        self.debug_cache_hit_ratio_steps: Optional[int] = None
+
+        self._cache = LRUCache[str, Mapping[str,
+                                            MultiModalFieldItem]](capacity)
+
+    def _maybe_log_cache_stats(self) -> None:
+        steps = self.debug_cache_hit_ratio_steps
+        if not steps:
+            return
+
+        cache_stats = self._cache.stat()
+        if cache_stats.total % steps == 0:
+            logger.debug("ProcessingCache: hit_ratio = %.2f",
+                         cache_stats.hit_ratio)
+
+    def _serialize_item(self, obj: object) -> bytes:
+        # Simple cases
+        if isinstance(obj, str):
+            return obj.encode("utf-8")
+        if isinstance(obj, bytes):
+            return obj
+        if isinstance(obj, Image):
+            return obj.tobytes()
+
+        # Convertible to NumPy arrays
+        if isinstance(obj, torch.Tensor):
+            obj = obj.numpy()
+        if isinstance(obj, (int, float)):
+            obj = np.array(obj)
+        if isinstance(obj, np.ndarray):
+            return obj.tobytes()
+
+        logger.warning(
+            "No serialization method found for %s. "
+            "Falling back to pickle.", type(obj))
+
+        return pickle.dumps(obj)
+
+    def _item_to_bytes(
+        self,
+        key: str,
+        obj: object,
+    ) -> Iterable[tuple[bytes, bytes]]:
+        # Recursive cases
+        if isinstance(obj, (list, tuple)):
+            for i, elem in enumerate(obj):
+                yield from self._item_to_bytes(f"{key}.{i}", elem)
+        elif isinstance(obj, dict):
+            for k, v in obj.items():
+                yield from self._item_to_bytes(f"{key}.{k}", v)
+        else:
+            key_bytes = self._serialize_item(key)
+            value_bytes = self._serialize_item(obj)
+            yield key_bytes, value_bytes
+
+    def _hash_kwargs(self, **kwargs: object) -> str:
+        hasher = blake3()
+
+        for k, v in kwargs.items():
+            for k_bytes, v_bytes in self._item_to_bytes(k, v):
+                hasher.update(k_bytes)
+                hasher.update(v_bytes)
+
+        return hasher.hexdigest()
+
+    def get(
+        self,
+        model_id: str,
+        modality: str,
+        input_item: object,
+        input_kwargs: Mapping[str, object],
+    ) -> Optional[Mapping[str, MultiModalFieldItem]]:
+        """
+        Get a processed multi-modal item from the cache
+        according to its dependencies, including:
+
+        - The model ID
+        - The modality of the item
+        - The original data item passed to the HF processor
+        - The configuration options of the HF processor
+        """
+        self._maybe_log_cache_stats()
+
+        cache_key = self._hash_kwargs(model_id=model_id,
+                                      **{modality: input_item},
+                                      **input_kwargs)
+        return self._cache.get(cache_key)
+
+    def put(
+        self,
+        model_id: str,
+        modality: str,
+        input_item: object,
+        input_kwargs: Mapping[str, object],
+        output_kwargs: Mapping[str, MultiModalFieldItem],
+    ) -> None:
+        """
+        Put a processed multi-modal item into the cache
+        according to its dependencies (see :meth:`get`).
+        """
+        cache_key = self._hash_kwargs(model_id=model_id,
+                                      **{modality: input_item},
+                                      **input_kwargs)
+        self._cache.put(cache_key, output_kwargs)
 
 
 class BaseMultiModalProcessor(ABC):
@@ -595,18 +602,24 @@ class BaseMultiModalProcessor(ABC):
     Abstract base class to process multi-modal inputs to be used in vLLM.
     """
 
-    def __init__(self, ctx: InputProcessingContext) -> None:
+    def __init__(self,
+                 ctx: InputProcessingContext,
+                 *,
+                 cache: Optional[ProcessingCache] = None,
+                 enable_sanity_checks: bool = True) -> None:
         super().__init__()
 
         self.ctx = ctx
+        self.cache = cache
+        self.enable_sanity_checks = enable_sanity_checks
 
     def __call__(
         self,
         prompt: str,
         mm_data: MultiModalDataDict,
-        mm_processor_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
-        return self.apply(prompt, mm_data, mm_processor_kwargs)
+        return self.apply(prompt, mm_data, hf_processor_mm_kwargs)
 
     def _get_hf_processor(self) -> ProcessorMixin:
         """
@@ -624,12 +637,21 @@ class BaseMultiModalProcessor(ABC):
     ) -> MultiModalDataItems:
         return MultiModalDataItems.from_dict(mm_data)
 
+    @abstractmethod
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        """Given the HF-processed data, output the metadata of each field."""
+        raise NotImplementedError
+
     @abstractmethod
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
-        hf_inputs: BatchFeature,
-        mm_processor_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         """
         Given the original multi-modal items for this modality
@@ -651,7 +673,7 @@ class BaseMultiModalProcessor(ABC):
         return list(
             iter_placeholders(all_prompt_repls, new_token_ids, mm_item_counts))
 
-    def _get_processor_data(
+    def _get_hf_mm_data(
         self,
         mm_items: MultiModalDataItems,
     ) -> tuple[dict[str, Any], dict[str, Any]]:
@@ -669,7 +691,7 @@ class BaseMultiModalProcessor(ABC):
                       and v[0].ndim == 2):
                     # Pass through embedding inputs (multi)
                     passthrough_data[f"{k}_embeds"] = v
-                else:
+                elif len(v) > 0:
                     # Map keys to plural form, e.g.: image -> images
                     processor_data[f"{k}s"] = v
             else:
@@ -679,39 +701,181 @@ class BaseMultiModalProcessor(ABC):
 
     def _call_hf_processor(
         self,
-        hf_processor: ProcessorMixin,
         prompt: str,
-        processor_data: Mapping[str, object],
-        mm_processor_kwargs: Mapping[str, object],
+        # Not to be confused with `mm_data` in `self.apply`.
+        # This refers to the data to be passed to HF processor.
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
     ) -> BatchFeature:
         return self.ctx.call_hf_processor(
-            hf_processor,
-            prompt,
-            processor_data,
-            mm_processor_kwargs,
+            self._get_hf_processor(**mm_kwargs),
+            dict(text=prompt, **mm_data),
+            mm_kwargs,
         )
 
     def _apply_hf_processor(
         self,
-        prompt: str,
+        prompt_text: str,
         mm_items: MultiModalDataItems,
-        mm_processor_kwargs: Mapping[str, object],
-    ) -> BatchFeature:
-        # some mm_processor_kwargs may be used in processor initialization
-        # instead of processor call
-        hf_processor = self._get_hf_processor(**mm_processor_kwargs)
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> tuple[list[int], MultiModalKwargs]:
+        """
+        Apply the HF processor on the full prompt text and multi-modal data.
+        """
+        processor_data, passthrough_data = self._get_hf_mm_data(mm_items)
 
-        processor_data, passthrough_data = self._get_processor_data(mm_items)
-
-        hf_inputs = self._call_hf_processor(
-            hf_processor,
-            prompt=prompt,
-            processor_data=processor_data,
-            mm_processor_kwargs=mm_processor_kwargs,
+        processed_data = self._call_hf_processor(
+            prompt=prompt_text,
+            mm_data=processor_data,
+            mm_kwargs=hf_processor_mm_kwargs,
         )
-        hf_inputs.update(passthrough_data)
+        processed_data.update(passthrough_data)
 
-        return hf_inputs
+        prompt_ids, = processed_data.pop("input_ids").tolist()
+
+        mm_kwargs = MultiModalKwargs.from_hf_inputs(
+            processed_data,
+            self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs),
+            enable_sanity_checks=self.enable_sanity_checks,
+        )
+
+        return prompt_ids, mm_kwargs
+
+    def _apply_hf_processor_missing(
+        self,
+        prompt_text: str,
+        mm_missing_data_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ):
+        """
+        Apply the HF processor on the full prompt text, but only on the
+        multi-modal data that are missing from the cache.
+
+        Note: We pass prompt text and multi-modal data into the HF processor
+        in separate calls to avoid HF prompt replacement being done for
+        cached items; instead, we rely on our own prompt replacement logic
+        for the full text.
+        """
+        mm_missing_counts = mm_missing_data_items.get_item_counts()
+
+        prompt_ids, _ = self._apply_hf_processor(
+            prompt_text=prompt_text,
+            mm_items=MultiModalDataItems({}),
+            hf_processor_mm_kwargs={},
+        )
+
+        # Some HF processors (e.g. Qwen2-VL) expect corresponding
+        # multi-modal tokens to be in the prompt text
+        dummy_inputs = self._get_dummy_mm_inputs(mm_missing_counts)
+
+        _, mm_missing_kwargs = self._apply_hf_processor(
+            prompt_text=dummy_inputs.prompt_text,
+            mm_items=mm_missing_data_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+        )
+
+        return prompt_ids, mm_missing_kwargs
+
+    def _cached_apply_hf_processor(
+        self,
+        prompt_text: str,
+        mm_data_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> tuple[list[int], MultiModalKwargs]:
+        """
+        Apply the HF processor on the full prompt text,
+        caching the results and reusing cached results.
+        """
+        cache = self.cache
+        model_id = self.ctx.model_config.model
+
+        if cache is None or mm_data_items.has_embedding_inputs():
+            return self._apply_hf_processor(
+                prompt_text=prompt_text,
+                mm_items=mm_data_items,
+                hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            )
+
+        mm_maybe_cached_field_items = {
+            modality: [
+                cache.get(model_id, modality, item, hf_processor_mm_kwargs)
+                for item in items
+            ]
+            for modality, items in mm_data_items.items()
+        }
+
+        mm_missing_idxs = {
+            modality: [idx for idx, out in enumerate(fields) if out is None]
+            for modality, fields in mm_maybe_cached_field_items.items()
+        }
+        mm_missing_data = {
+            modality: [mm_data_items[modality][idx] for idx in idxs]
+            for modality, idxs in mm_missing_idxs.items()
+        }
+        mm_missing_data_items = self._get_mm_items(mm_missing_data)
+
+        prompt_ids, mm_missing_kwargs = self._apply_hf_processor_missing(
+            prompt_text=prompt_text,
+            mm_missing_data_items=mm_missing_data_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+        )
+
+        mm_missing_next_idx = {
+            modality: 0
+            for modality in mm_missing_data_items
+        }
+
+        mm_merged_field_items = dict[str, list[Mapping[str,
+                                                       MultiModalFieldItem]]]()
+        for modality, modal_items_lst in mm_maybe_cached_field_items.items():
+            merged_modal_items_lst = list[Mapping[str, MultiModalFieldItem]]()
+
+            for idx, modal_items in enumerate(modal_items_lst):
+                if modal_items is None:
+                    modal_items = mm_missing_kwargs.get_items_by_modality(
+                        modality,
+                        mm_missing_next_idx[modality],
+                    )
+
+                    cache.put(
+                        model_id,
+                        modality,
+                        mm_data_items[modality][idx],
+                        hf_processor_mm_kwargs,
+                        modal_items,
+                    )
+
+                    mm_missing_next_idx[modality] += 1
+
+                merged_modal_items_lst.append(modal_items)
+
+            mm_merged_field_items[modality] = merged_modal_items_lst
+
+        if self.enable_sanity_checks:
+            mm_missing_counts = mm_missing_data_items.get_item_counts()
+            assert all(
+                item_count == mm_missing_counts[modality]
+                for modality, item_count in mm_missing_next_idx.items()), dict(
+                    mm_missing_next_idx=mm_missing_next_idx,
+                    mm_missing_counts=mm_missing_counts)
+
+        mm_kwargs = MultiModalKwargs.from_items_by_modality(
+            mm_merged_field_items,
+            enable_sanity_checks=self.enable_sanity_checks,
+        )
+
+        if self.enable_sanity_checks:
+            mm_item_counts = mm_data_items.get_item_counts()
+
+            for modality, item_count in mm_item_counts.items():
+                for item_idx in range(item_count):
+                    try:
+                        mm_kwargs.get_items_by_modality(modality, item_idx)
+                    except Exception as e:
+                        # Make it easy to set a breakpoint in the debugger
+                        raise e
+
+        return prompt_ids, mm_kwargs
 
     def _bind_prompt_replacements(
         self,
@@ -730,6 +894,10 @@ class BaseMultiModalProcessor(ABC):
         tokenizer = self._get_tokenizer()
 
         token_matches = find_token_matches(token_ids, prompt_repls)
+        mm_match_counts = {
+            modality: len(matches)
+            for modality, matches in full_groupby_modality(token_matches)
+        }
 
         # If the search text does not represent a special token,
         # it may have different token IDs in the prompt, because
@@ -742,8 +910,8 @@ class BaseMultiModalProcessor(ABC):
         # of the search text in the prompt, we instead perform string
         # replacement on the decoded token IDs, then encode them back.
         if all(
-            len(matches) >= mm_item_counts[modality]
-            for modality, matches in full_groupby_modality(token_matches)
+            mm_match_counts.get(modality, 0) >= item_count
+            for modality, item_count in mm_item_counts.items()
         ):  # yapf: disable
             token_ids = replace_token_matches(
                 token_ids,
@@ -775,7 +943,7 @@ class BaseMultiModalProcessor(ABC):
         self,
         prompt_text: str,
         mm_data: MultiModalDataDict,
-        mm_processor_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -792,20 +960,24 @@ class BaseMultiModalProcessor(ABC):
         """
         mm_items = self._get_mm_items(mm_data)
 
-        hf_inputs = self._apply_hf_processor(prompt_text, mm_items,
-                                             mm_processor_kwargs)
-        prompt_ids, = hf_inputs.pop("input_ids").tolist()
-        mm_kwargs = MultiModalKwargs(hf_inputs)
+        prompt_ids, mm_kwargs = self._cached_apply_hf_processor(
+            prompt_text,
+            mm_items,
+            hf_processor_mm_kwargs,
+        )
 
-        prompt_repls = self._get_prompt_replacements(mm_items, hf_inputs,
-                                                     mm_processor_kwargs)
-        all_prompt_repls = self._bind_prompt_replacements(prompt_repls)
+        unbound_prompt_repls = self._get_prompt_replacements(
+            mm_items,
+            hf_processor_mm_kwargs,
+            mm_kwargs,
+        )
+        prompt_repls = self._bind_prompt_replacements(unbound_prompt_repls)
 
         # If HF processor already inserts placeholder tokens,
         # there is no need for us to insert them
         mm_item_counts = mm_items.get_item_counts()
-        all_placeholders = self._find_placeholders(all_prompt_repls,
-                                                   prompt_ids, mm_item_counts)
+        all_placeholders = self._find_placeholders(prompt_repls, prompt_ids,
+                                                   mm_item_counts)
 
         if all_placeholders:
             tokenizer = self._get_tokenizer()
@@ -817,7 +989,7 @@ class BaseMultiModalProcessor(ABC):
                 all_placeholders,
             ) = self._apply_prompt_replacements(
                 prompt_ids,
-                all_prompt_repls,
+                prompt_repls,
                 mm_item_counts,
             )
 
@@ -855,23 +1027,29 @@ class BaseMultiModalProcessor(ABC):
         from vllm.sequence import SequenceData
 
         processor_inputs = self._get_dummy_mm_inputs(mm_counts)
-        mm_inputs = self.apply(*processor_inputs)
+        mm_inputs = self.apply(
+            prompt_text=processor_inputs.prompt_text,
+            mm_data=processor_inputs.mm_data,
+            hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
+        )
 
         prompt_token_ids = mm_inputs["prompt_token_ids"]
         placeholders_by_modality = mm_inputs["mm_placeholders"]
 
-        total_placeholders_by_modality = dict[str, int]()
-        for modality, placeholders in placeholders_by_modality.items():
-            num_placeholders = sum(item["length"] for item in placeholders)
-            max_tokens = mm_max_tokens[modality]
-
-            if num_placeholders != max_tokens:
-                logger.warning(
-                    "The processed dummy data has a total of %d placeholder "
-                    "tokens for the '%s' modality, which is not the expected "
-                    "%d tokens.", num_placeholders, modality, max_tokens)
-
-            total_placeholders_by_modality[modality] = num_placeholders
+        total_placeholders_by_modality = {
+            modality: sum(item["length"] for item in placeholders)
+            for modality, placeholders in placeholders_by_modality.items()
+        }
+        expected_placeholders_by_modality = {
+            modality: mm_max_tokens[modality]
+            for modality in placeholders_by_modality
+        }
+        if total_placeholders_by_modality != expected_placeholders_by_modality:
+            raise AssertionError(
+                f"The processed dummy data has a total of "
+                f"{total_placeholders_by_modality} placeholder tokens, which "
+                f"is not the expected {expected_placeholders_by_modality} "
+                "tokens.")
 
         total_len = len(prompt_token_ids)
         if total_len > seq_len:
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index ded45a7184b5d..3a5e11867ad9e 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,10 +1,9 @@
 import functools
 from collections import UserDict
-from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional,
+from typing import (TYPE_CHECKING, Any, Dict, Mapping, Optional, Protocol,
                     Sequence, Type, TypeVar)
 
 import torch.nn as nn
-from typing_extensions import TypeAlias
 
 from vllm.inputs import InputProcessingContext
 from vllm.logger import init_logger
@@ -15,7 +14,7 @@ from .audio import AudioPlugin
 from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc
 from .image import ImagePlugin
 from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors
-from .processing import BaseMultiModalProcessor
+from .processing import BaseMultiModalProcessor, ProcessingCache
 from .video import VideoPlugin
 
 if TYPE_CHECKING:
@@ -23,15 +22,22 @@ if TYPE_CHECKING:
 
 logger = init_logger(__name__)
 
+# TODO: Tune the MM cache size
+MM_CACHE_SIZE = 256
+
 N = TypeVar("N", bound=Type[nn.Module])
 
-MultiModalProcessorFactory: TypeAlias = Callable[[InputProcessingContext],
-                                                 BaseMultiModalProcessor]
-"""
-Constructs a :class:`MultiModalProcessor` instance from the context.
 
-The processing metadata should be derived from the context.
-"""
+class MultiModalProcessorFactory(Protocol):
+    """Constructs a :class:`MultiModalProcessor` instance from the context."""
+
+    def __call__(
+        self,
+        ctx: InputProcessingContext,
+        *,
+        cache: Optional[ProcessingCache] = None,
+    ) -> BaseMultiModalProcessor:
+        ...
 
 
 class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]):
@@ -71,6 +77,8 @@ class MultiModalRegistry:
 
         self._limits_by_model = _MultiModalLimits()
 
+        self._processing_cache = ProcessingCache(MM_CACHE_SIZE)
+
     def register_plugin(self, plugin: MultiModalPlugin) -> None:
         """
         Register a multi-modal plugin so it can be recognized by vLLM.
@@ -328,15 +336,18 @@ class MultiModalRegistry:
 
         return wrapper
 
-    def has_processor(self, model_config: "ModelConfig") -> bool:
-        """
-        Test whether a multi-modal processor is defined for a specific model.
-        """
+    def _get_model_cls(self, model_config: "ModelConfig"):
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
 
         model_cls, _ = get_model_architecture(model_config)
-        return model_cls in self._processor_factories
+        return model_cls
+
+    def has_processor(self, model_config: "ModelConfig") -> bool:
+        """
+        Test whether a multi-modal processor is defined for a specific model.
+        """
+        return self._get_model_cls(model_config) in self._processor_factories
 
     def create_processor(
         self,
@@ -346,12 +357,11 @@ class MultiModalRegistry:
         """
         Create a multi-modal processor for a specific model and tokenizer.
         """
-
-        # Avoid circular import
-        from vllm.model_executor.model_loader import get_model_architecture
-
-        model_cls, _ = get_model_architecture(model_config)
+        model_cls = self._get_model_cls(model_config)
         processor_factory = self._processor_factories[model_cls]
 
         ctx = InputProcessingContext(model_config, tokenizer)
-        return processor_factory(ctx)
+        cache = (None if model_config.disable_mm_preprocessor_cache else
+                 self._processing_cache)
+
+        return processor_factory(ctx, cache=cache)
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index f1523667b0466..b12cc83a22970 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -1,25 +1,31 @@
 from functools import lru_cache
 from typing import Any, cast
 
+from transformers.processing_utils import ProcessorMixin
+
 
 def get_processor(
     processor_name: str,
     *args: Any,
     trust_remote_code: bool = False,
+    processor_cls: type[ProcessorMixin] = ProcessorMixin,
     **kwargs: Any,
 ):
     """Load a processor for the given model name via HuggingFace."""
     # don't put this import at the top level
     # it will call torch.cuda.device_count()
     from transformers import AutoProcessor
-    from transformers.processing_utils import ProcessorMixin
+
+    processor_factory = (AutoProcessor
+                         if processor_cls == ProcessorMixin else processor_cls)
 
     try:
-        processor = AutoProcessor.from_pretrained(
+        processor = processor_factory.from_pretrained(
             processor_name,
             *args,
             trust_remote_code=trust_remote_code,
-            **kwargs)
+            **kwargs,
+        )
     except ValueError as e:
         # If the error pertains to the processor class not existing or not
         # currently being imported, suggest using the --trust-remote-code flag.
diff --git a/vllm/utils.py b/vllm/utils.py
index 3d198887021dc..5eb4e8c4180c4 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -25,11 +25,11 @@ import warnings
 import weakref
 from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task
 from collections import OrderedDict, UserDict, defaultdict
-from collections.abc import Iterable, Mapping
+from collections.abc import Hashable, Iterable, Mapping
 from dataclasses import dataclass, field
 from functools import lru_cache, partial, wraps
 from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
-                    Dict, Generator, Generic, Hashable, List, Literal,
+                    Dict, Generator, Generic, List, Literal, NamedTuple,
                     Optional, Tuple, Type, TypeVar, Union, overload)
 from uuid import uuid4
 
@@ -194,13 +194,29 @@ class Counter:
         self.counter = 0
 
 
+class CacheInfo(NamedTuple):
+    hits: int
+    total: int
+
+    @property
+    def hit_ratio(self) -> float:
+        if self.total == 0:
+            return 0
+
+        return self.hits / self.total
+
+
 class LRUCache(Generic[_K, _V]):
+    """Note: This class is not thread safe!"""
 
     def __init__(self, capacity: int) -> None:
         self.cache = OrderedDict[_K, _V]()
         self.pinned_items = set[_K]()
         self.capacity = capacity
 
+        self._hits = 0
+        self._total = 0
+
     def __contains__(self, key: _K) -> bool:
         return key in self.cache
 
@@ -218,6 +234,9 @@ class LRUCache(Generic[_K, _V]):
     def __delitem__(self, key: _K) -> None:
         self.pop(key)
 
+    def stat(self) -> CacheInfo:
+        return CacheInfo(hits=self._hits, total=self._total)
+
     def touch(self, key: _K) -> None:
         self.cache.move_to_end(key)
 
@@ -226,8 +245,12 @@ class LRUCache(Generic[_K, _V]):
         if key in self.cache:
             value = self.cache[key]
             self.cache.move_to_end(key)
+
+            self._hits += 1
         else:
             value = default
+
+        self._total += 1
         return value
 
     def put(self, key: _K, value: _V) -> None:

From 55509c2114718c1292c11348f002461ba44cb23b Mon Sep 17 00:00:00 2001
From: ErezSC42 <erezs@ai21.com>
Date: Fri, 27 Dec 2024 19:58:21 +0200
Subject: [PATCH 10/48] [MODEL] LoRA support for Jamba model (#11209)

Signed-off-by: Erez Schwartz <erezs@ai21.com>
---
 tests/lora/conftest.py                        | 24 +++++++++
 tests/lora/test_jamba.py                      | 54 +++++++++++++++++++
 .../layers/mamba/mamba_mixer.py               | 22 ++++++--
 vllm/model_executor/models/jamba.py           | 50 ++++++++---------
 vllm/model_executor/models/mamba.py           | 14 +++--
 5 files changed, 132 insertions(+), 32 deletions(-)
 create mode 100644 tests/lora/test_jamba.py

diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 8b247fb9b2388..57ebaa424fc59 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -4,6 +4,7 @@ from typing import Dict, List, TypedDict
 from unittest.mock import MagicMock, patch
 
 import pytest
+import safetensors
 import torch
 import torch.nn as nn
 from huggingface_hub import snapshot_download
@@ -169,6 +170,29 @@ def mixtral_lora_files_all_target_modules():
     return snapshot_download(repo_id="dyang415/mixtral-lora-v0")
 
 
+@pytest.fixture(scope="session")
+def jamba_lora_files():
+    #   some of the adapters have unnecessary weights for serving,
+    #   hence we remove them
+    def remove_unnecessary_weights(path):
+        lora_path = f"{adapter_path}/adapter_model.safetensors"
+        tensors = safetensors.torch.load_file(lora_path)
+        nonlora_keys = []
+        for k in list(tensors.keys()):
+            if "lora" not in k:
+                nonlora_keys.append(k)
+        for k in nonlora_keys:
+            del tensors[k]
+        safetensors.torch.save_file(tensors, lora_path)
+
+    adapter_path = snapshot_download(
+        repo_id=
+        "hf-100/Jamba-1.5-mini-Spellbound-StoryWriter-0.1-6583896-ckpt53-lora")
+
+    remove_unnecessary_weights(adapter_path)
+    return adapter_path
+
+
 @pytest.fixture(scope="session")
 def gemma_lora_files():
     return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
diff --git a/tests/lora/test_jamba.py b/tests/lora/test_jamba.py
new file mode 100644
index 0000000000000..6aa33926cb6b8
--- /dev/null
+++ b/tests/lora/test_jamba.py
@@ -0,0 +1,54 @@
+from typing import List
+
+import pytest
+import torch
+
+import vllm
+from vllm.lora.request import LoRARequest
+
+MODEL_PATH = "ai21labs/AI21-Jamba-1.5-Mini"
+
+MAX_TOKENS = 40
+
+
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
+              prompts: List[str]) -> List[str]:
+
+    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=MAX_TOKENS)
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+        if lora_id else None)
+    # Print the outputs.
+    generated_texts: List[str] = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    return generated_texts
+
+
+@pytest.mark.parametrize("tp_size", [4])
+def test_jamba_lora(jamba_lora_files, tp_size):
+    """Original test, the LoRA model has the common target modules, not all"""
+    if torch.cuda.device_count() < tp_size:
+        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+
+    prompts = ["Write a story about a sheep and a goat."]
+
+    llm = vllm.LLM(
+        MODEL_PATH,
+        enable_lora=True,
+        max_num_seqs=16,
+        max_loras=4,
+        distributed_executor_backend="ray",
+        tensor_parallel_size=tp_size,
+    )
+
+    expected_jamba_output = [
+        """Once upon a time, in a lush green meadow, there lived a sheep named Clara and a goat named Billy. Clara was a gentle creature, always nibbling on the soft grass and humming"""  # noqa: E501
+    ]
+    assert do_sample(llm, jamba_lora_files, lora_id=1,
+                     prompts=prompts) == expected_jamba_output
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
index 10bec75f49fdf..606c796d503cf 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -42,12 +42,14 @@ class MambaMixer(CustomOp):
                  use_rms_norm: bool,
                  rms_norm_has_weight: bool = True,
                  rms_norm_eps: float = 1e-5,
-                 activation="silu"):
+                 activation="silu",
+                 is_lora_enabled: bool = False):
         super().__init__()
         self.time_step_rank = time_step_rank
         self.ssm_state_size = ssm_state_size
         self.use_rms_norm = use_rms_norm
         self.activation = activation
+        self.is_lora_enabled = is_lora_enabled
 
         self.conv1d = ColumnParallelLinear(
             input_size=conv_kernel_size,
@@ -63,6 +65,7 @@ class MambaMixer(CustomOp):
         self.in_proj = MergedColumnParallelLinear(hidden_size,
                                                   [intermediate_size] * 2,
                                                   bias=use_bias)
+
         # selective projection used to make dt, B and C input dependent
         self.x_proj = RowParallelLinear(
             intermediate_size,
@@ -170,7 +173,13 @@ class MambaMixer(CustomOp):
 
         # 3. State Space Model sequence transformation
         # 3.a. input varying initialization of time_step, B and C
-        ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0]
+
+        if self.is_lora_enabled:
+            #   lora kernel requires contiguous tensor
+            ssm_parameters = self.x_proj(
+                hidden_states.transpose(-2, -1).contiguous())[0]
+        else:
+            ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0]
 
         time_step, B, C = torch.split(
             ssm_parameters,
@@ -222,6 +231,11 @@ class MambaMixer(CustomOp):
             scan_outputs = scan_outputs.transpose(0, 1)
 
         # 4. Final linear projection
-        contextualized_states = self.out_proj(scan_outputs.transpose(-2,
-                                                                     -1))[0]
+        if self.is_lora_enabled:
+            #  lora kernel requires contiguous tensor
+            contextualized_states = self.out_proj(
+                scan_outputs.transpose(-2, -1).contiguous())[0]
+        else:
+            contextualized_states = self.out_proj(
+                scan_outputs.transpose(-2, -1))[0]
         return contextualized_states
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 91786db5ddc96..890b5530b97d6 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -107,9 +107,11 @@ class JambaMambaDecoderLayer(nn.Module):
                  layer_idx: int,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
+                 is_lora_enabled: Optional[bool] = False,
+                 **kwargs) -> None:
         super().__init__()
         self.config = config
+        self.is_lora_enabled = is_lora_enabled
         self.mamba = MambaMixer(hidden_size= config.hidden_size,
                                 ssm_state_size = config.mamba_d_state,
                                 conv_kernel_size = config.mamba_d_conv,
@@ -120,7 +122,9 @@ class JambaMambaDecoderLayer(nn.Module):
                                 use_bias = config.mamba_proj_bias,
                                 use_rms_norm=True,
                                 rms_norm_eps=config.rms_norm_eps,
-                                activation=config.hidden_act)
+                                activation=config.hidden_act,
+                                is_lora_enabled = self.is_lora_enabled
+                                )
 
         num_experts = config.layers_num_experts[layer_idx]
         ffn_layer_class = JambaMoE if num_experts > 1 else JambaMLP
@@ -156,14 +160,13 @@ class JambaMambaDecoderLayer(nn.Module):
 
 class JambaAttentionDecoderLayer(nn.Module):
 
-    def __init__(
-        self,
-        config: JambaConfig,
-        layer_idx: int,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self,
+                 config: JambaConfig,
+                 layer_idx: int,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = "",
+                 **kwargs) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         tp_size = get_tensor_model_parallel_world_size()
@@ -287,17 +290,18 @@ class JambaModel(nn.Module):
             org_num_embeddings=config.vocab_size,
         )
 
+        extra_kwargs = {"is_lora_enabled": bool(vllm_config.lora_config)}
+
         def get_layer(prefix: str):
             layer_idx = int(prefix.rsplit(".", 1)[1])
             layer_class = ALL_DECODER_LAYER_TYPES[
                 config.layers_block_type[layer_idx]]
-            return layer_class(
-                config,
-                layer_idx,
-                cache_config,
-                quant_config=quant_config,
-                prefix=prefix,
-            )
+            return layer_class(config,
+                               layer_idx,
+                               cache_config,
+                               quant_config=quant_config,
+                               prefix=prefix,
+                               **extra_kwargs)
 
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers")
@@ -371,14 +375,13 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
             "k_proj",
             "v_proj",
         ],
+        "in_proj": ["in_proj"],
     }
 
     # LoRA specific attributes
     supported_lora_modules = [
-        "qkv_proj",
-        "o_proj",
-        "embed_tokens",
-        "lm_head",
+        "qkv_proj", "o_proj", "embed_tokens", "lm_head", "up_proj",
+        "down_proj", "gate_proj", "out_proj", "in_proj", "x_proj"
     ]
     embedding_modules = {
         "embed_tokens": "input_embeddings",
@@ -423,9 +426,9 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
         if self.scheduler_config is not None and \
-            not self.model_config.enforce_eager:
+                not self.model_config.enforce_eager:
             if self.scheduler_config.max_num_seqs > \
-                vllm_config.compilation_config.max_capture_size:
+                    vllm_config.compilation_config.max_capture_size:
                 self.max_batch_size = \
                     vllm_config.compilation_config.max_capture_size
             else:
@@ -446,7 +449,6 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
                 inputs_embeds: Optional[torch.Tensor] = None,
                 **kwargs):
         if self.mamba_cache is None:
-
             num_mamba_layers = self.model_config.get_num_layers_by_block_type(
                 self.vllm_config.parallel_config, LayerBlockType.mamba)
             self.mamba_cache = MambaCacheManager(
diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py
index 06c8d9723cd01..553bc9c28cb21 100644
--- a/vllm/model_executor/models/mamba.py
+++ b/vllm/model_executor/models/mamba.py
@@ -38,10 +38,12 @@ class MambaDecoderLayer(nn.Module):
     def __init__(self,
                  config: MambaConfig,
                  cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None) -> None:
+                 quant_config: Optional[QuantizationConfig] = None,
+                 is_lora_enabled: Optional[bool] = False) -> None:
         super().__init__()
         self.config = config
         self.is_falcon_mamba = config.model_type == "falcon_mamba"
+        self.is_lora_enabled = is_lora_enabled
         mixer_rms_eps = config.mixer_rms_eps if self.is_falcon_mamba else None
         self.mixer = MambaMixer(hidden_size=config.hidden_size,
                                 ssm_state_size=config.state_size,
@@ -53,7 +55,8 @@ class MambaDecoderLayer(nn.Module):
                                 use_rms_norm=self.is_falcon_mamba,
                                 rms_norm_has_weight=not self.is_falcon_mamba,
                                 rms_norm_eps=mixer_rms_eps,
-                                activation=config.hidden_act)
+                                activation=config.hidden_act,
+                                is_lora_enabled=self.is_lora_enabled)
 
         self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
 
@@ -85,6 +88,7 @@ class MambaModel(nn.Module):
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
+        is_lora_enabled = bool(lora_config)
 
         self.config = config
         self.padding_idx = config.pad_token_id
@@ -101,8 +105,10 @@ class MambaModel(nn.Module):
 
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: MambaDecoderLayer(
-                config, cache_config=cache_config, quant_config=quant_config),
+            lambda prefix: MambaDecoderLayer(config,
+                                             cache_config=cache_config,
+                                             quant_config=quant_config,
+                                             is_lora_enabled=is_lora_enabled),
             prefix=f"{prefix}.layers")
 
         self.norm_f = RMSNorm(config.hidden_size,

From 0240402c4632604c9cd02f7eae4ae36fa990b38f Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Sat, 28 Dec 2024 02:48:24 +0800
Subject: [PATCH 11/48] [Misc]Add BNB quantization for MolmoForCausalLM 
 (#11551)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/model_loader/loader.py | 26 +++++--
 vllm/model_executor/models/molmo.py        | 90 ++++++++++++++++------
 2 files changed, 83 insertions(+), 33 deletions(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index f2d9293b31a83..4bca13cb2f60c 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -11,7 +11,8 @@ import os
 import warnings
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
-from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, cast
+from typing import (Any, Callable, Dict, Generator, Iterable, List, Optional,
+                    Tuple, cast)
 
 import gguf
 import huggingface_hub
@@ -706,6 +707,8 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         # Store all module names (from transformers) that support
         # BNB quantization.
         self.target_modules: List[str] = []
+        # mapping weight names from transformers to vllm.
+        self.weight_mapper: Callable = lambda name: name
 
     def _get_weight_files(
         self,
@@ -763,9 +766,12 @@ class BitsAndBytesModelLoader(BaseModelLoader):
 
     def _hf_weight_iter(self, hf_weights_files, use_safetensors: bool):
         if use_safetensors:
-            return safetensors_weights_iterator(hf_weights_files)
+            iterator = safetensors_weights_iterator(hf_weights_files)
         else:
-            return pt_weights_iterator(hf_weights_files)
+            iterator = pt_weights_iterator(hf_weights_files)
+        for name, param in iterator:
+            # mapping weight names from transformers to vllm.
+            yield self.weight_mapper(name), param
 
     def _get_quantized_weights_iterator(
         self,
@@ -782,12 +788,12 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         try:
             import bitsandbytes
 
-            if bitsandbytes.__version__ < "0.44.0":
+            if bitsandbytes.__version__ < "0.45.0":
                 raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.44.0.")
+                                  "install bitsandbytes>=0.45.0.")
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.44.0 via "
-                              "`pip install bitsandbytes>=0.44.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.45.0 via "
+                              "`pip install bitsandbytes>=0.45.0` to use "
                               "bitsandbytes quantizer.") from err
 
         hf_weights_files, use_safetensors = self._prepare_weights(
@@ -991,7 +997,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             if isinstance(module, (LinearBase, )):
                 last_name = name.split(".")[-1]
                 if sub_modules := inverse_stacked_mapping.get(last_name, []):
-                    # Map vllm's names to transformers' names.
+                    # Map vllm's names to transformers's names.
                     for sub_name in sub_modules:
                         self.target_modules.append(
                             name.replace(last_name, sub_name))
@@ -1013,6 +1019,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                 f"Model {type(model).__name__} does not support BitsAndBytes "
                 "quantization yet.")
 
+        # For some models like Molmo, we need to use hf_to_vllm_mapper
+        # to ensure correct loading of weights.
+        if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None):
+            self.weight_mapper = lambda name: hf_to_vllm_mapper._map_name(name)
         # Modules whose weights might have fused on disk
         # we need their output_sizes to make shard in flight correctly with TP
         self.maybe_fused_weights_modules: Dict[str, List[int]] = {}
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 8938f62d0c494..5d52d2c3e6b48 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -461,30 +461,71 @@ class MolmoAttention(nn.Module):
         return output
 
 
-class MolmoMLP(nn.Module):
+class SwiGLU(nn.Module):
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, gate = x.chunk(2, dim=-1)
+        # Note that the order is reversed compared to
+        # SiluAndMul.
+        return x * F.silu(gate)
+
+
+class LanuageModelMLP(nn.Module):
     """Molmo's LLM mlp."""
 
     def __init__(self,
                  config: PretrainedConfig,
                  input_dim: Optional[int] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 proj_name: str = "gate_up_proj") -> None:
+                 quant_config: Optional[QuantizationConfig] = None) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size // 2
 
-        # Molmo's LLM proj weights are already merged into the disk, while
-        # image_projector proj is separate. If the same proj_name were used, it
-        # would create ambiguity and make it difficult to support BNB and LoRA.
-        self.proj_name = proj_name
-        setattr(
-            self, proj_name,
-            MergedColumnParallelLinear(
-                input_dim or self.hidden_size,
-                [self.intermediate_size] * 2,
-                bias=False,
-                quant_config=quant_config,
-            ))
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_dim or self.hidden_size,
+            [self.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+        )
+        # Activation function.
+        self.act_fn = SwiGLU()
+        # Feed-forward output projection.
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class ImageProjectorMLP(nn.Module):
+    """Molmo's image_projector mlp."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        input_dim: Optional[int] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size // 2
+
+        self.merged_linear = MergedColumnParallelLinear(
+            input_dim or self.hidden_size,
+            [self.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+        )
         # Activation function.
         self.act_fn = SiluAndMul()
 
@@ -500,7 +541,7 @@ class MolmoMLP(nn.Module):
         self,
         x: torch.Tensor,
     ) -> torch.Tensor:
-        gate_up, _ = getattr(self, self.proj_name)(x)
+        gate_up, _ = self.merged_linear(x)
         x = self.act_fn(gate_up)
         x, _ = self.down_proj(x)
         return x
@@ -523,9 +564,7 @@ class MolmoDecoderLayer(nn.Module):
                                         prefix=f"{prefix}.self_attn")
 
         # MLP block.
-        self.mlp = MolmoMLP(config,
-                            quant_config=quant_config,
-                            proj_name="gate_up_proj")
+        self.mlp = LanuageModelMLP(config, quant_config=quant_config)
 
         # LayerNorm
         assert config.layer_norm_type == "rms"
@@ -617,11 +656,10 @@ class MolmoVisionBackbone(nn.Module):
             vision_config,
             nlayers=len(self.vit_layers),
             quant_config=quant_config)
-        self.image_projector = MolmoMLP(
+        self.image_projector = ImageProjectorMLP(
             config,
             input_dim=vision_config.image_emb_dim,
             quant_config=quant_config,
-            proj_name="merged_linear",
         )
 
         image_dim = vision_config.image_emb_dim * len(self.vit_layers)
@@ -842,10 +880,6 @@ class MolmoModel(nn.Module):
         loaded_params: Set[str] = set()
 
         for name, loaded_weight in weights:
-            if "gate_up_proj" in name:
-                up_proj, gate_proj = loaded_weight.chunk(2, dim=0)
-                loaded_weight = torch.cat([gate_proj, up_proj], dim=0)
-
             if name.endswith(".bias") and name not in params_dict:
                 continue
             if is_pp_missing_parameter(name, self):
@@ -1157,6 +1191,12 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         },
     )
 
+    # BitandBytes specific attributes
+    bitsandbytes_stacked_params_mapping = {
+        "gate_proj": ("merged_linear", 0),
+        "up_proj": ("merged_linear", 1),
+    }
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config

From dde1fa18c9f9ba992a8300a300543d6c18d5f08d Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 28 Dec 2024 03:45:13 +0800
Subject: [PATCH 12/48] [Misc] Improve BNB loader to handle mixture of sharded
 and merged weights with same suffix (#11566)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 vllm/model_executor/model_loader/loader.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 4bca13cb2f60c..a9c1fa7221217 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -1001,8 +1001,11 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                     for sub_name in sub_modules:
                         self.target_modules.append(
                             name.replace(last_name, sub_name))
-                else:
-                    self.target_modules.append(name)
+                # Add original module name even if the module has stacked map,
+                # in case model has a mixture of disk-merged and disk-splitted
+                # weights with same last name.
+                self.target_modules.append(name)
+
         assert (self.target_modules
                 ), "vllm currently does not support BNB quantization for"
         f" {type(model).__name__}"

From ac797994039ba9e6ed0c2b3a503099cb122a936e Mon Sep 17 00:00:00 2001
From: Selali <selali.adobor@gmail.com>
Date: Fri, 27 Dec 2024 12:12:11 -0800
Subject: [PATCH 13/48] [Bugfix] Fix for ROCM compressed tensor support
 (#11561)

---
 .../schemes/compressed_tensors_w8a8_fp8.py             | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index 73cc8ce0d2a4b..1d4e4bd52adaa 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -41,10 +41,12 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
             )
 
             if current_platform.is_rocm():
+                input_scale = getattr(layer, 'input_scale', None)
+
                 weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
                     weight=weight,
                     weight_scale=max_w_scale,
-                    input_scale=layer.input_scale)
+                    input_scale=input_scale)
                 if input_scale is not None:
                     layer.input_scale = Parameter(input_scale,
                                                   requires_grad=False)
@@ -57,11 +59,13 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
             weight = layer.weight
 
             if current_platform.is_rocm():
+                input_scale = getattr(layer, 'input_scale', None)
+
                 weight, weight_scale, input_scale = \
                     normalize_e4m3fn_to_e4m3fnuz(
                         weight=weight,
                         weight_scale=layer.weight_scale,
-                        input_scale=layer.input_scale)
+                        input_scale=input_scale)
                 if input_scale is not None:
                     layer.input_scale = Parameter(input_scale,
                                                   requires_grad=False)
@@ -76,7 +80,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
             raise ValueError(f"Unknown quantization strategy {self.strategy}")
 
         # INPUT SCALE
-        if self.is_static_input_scheme:
+        if self.is_static_input_scheme and hasattr(layer, 'input_scale'):
             layer.input_scale = Parameter(layer.input_scale.max(),
                                           requires_grad=False)
         else:

From a60731247fba82fae5e71af7a19ea0df96de1caa Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Sat, 28 Dec 2024 08:31:10 +0800
Subject: [PATCH 14/48] [Doc] Update mllama example based on official doc
 (#11567)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 examples/offline_inference_vision_language.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index d5a71862656e7..77af914a6ef02 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -308,7 +308,20 @@ def run_mllama(question: str, modality: str):
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
-    prompt = f"<|image|><|begin_of_text|>{question}"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    messages = [{
+        "role":
+        "user",
+        "content": [{
+            "type": "image"
+        }, {
+            "type": "text",
+            "text": f"{question}"
+        }]
+    }]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           add_generation_prompt=True,
+                                           tokenize=False)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 

From df04dffade84c87cafd74de4c39e6fd7cb95c24f Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 27 Dec 2024 20:45:08 -0500
Subject: [PATCH 15/48] [V1] [4/N] API Server: ZMQ/MP Utilities (#11541)

---
 docs/requirements-docs.txt                 |   1 +
 tests/v1/engine/test_engine_core.py        |  13 +--
 tests/v1/engine/test_engine_core_client.py |  10 +-
 vllm/entrypoints/openai/api_server.py      |  11 +-
 vllm/executor/multiproc_worker_utils.py    |  22 +---
 vllm/utils.py                              |  90 ++++++++++++++++-
 vllm/v1/engine/async_llm.py                |   6 +-
 vllm/v1/engine/core.py                     | 111 ++++-----------------
 vllm/v1/engine/core_client.py              |  92 ++++++++---------
 vllm/v1/engine/llm_engine.py               |   6 +-
 vllm/v1/executor/multiproc_executor.py     |  11 +-
 vllm/v1/utils.py                           |  79 ++++++++++-----
 12 files changed, 242 insertions(+), 210 deletions(-)

diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index 4859c8ac08bea..25a700033cc9e 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -19,3 +19,4 @@ openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entr
 fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
 partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
 requests
+zmq
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index c529cd21f384b..954cec734b956 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -7,7 +7,6 @@ from transformers import AutoTokenizer
 from vllm import SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
-from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.core import EngineCore
@@ -43,13 +42,11 @@ def test_engine_core(monkeypatch):
         m.setenv("VLLM_USE_V1", "1")
         """Setup the EngineCore."""
         engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config(
-            usage_context=UsageContext.UNKNOWN_CONTEXT)
+        vllm_config = engine_args.create_engine_config()
         executor_class = AsyncLLM._get_executor_cls(vllm_config)
 
         engine_core = EngineCore(vllm_config=vllm_config,
-                                 executor_class=executor_class,
-                                 usage_context=UsageContext.UNKNOWN_CONTEXT)
+                                 executor_class=executor_class)
         """Test basic request lifecycle."""
 
         # First request.
@@ -151,13 +148,11 @@ def test_engine_core_advanced_sampling(monkeypatch):
         m.setenv("VLLM_USE_V1", "1")
         """Setup the EngineCore."""
         engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config(
-            usage_context=UsageContext.UNKNOWN_CONTEXT)
+        vllm_config = engine_args.create_engine_config()
         executor_class = AsyncLLM._get_executor_cls(vllm_config)
 
         engine_core = EngineCore(vllm_config=vllm_config,
-                                 executor_class=executor_class,
-                                 usage_context=UsageContext.UNKNOWN_CONTEXT)
+                                 executor_class=executor_class)
         """Test basic request lifecycle."""
         # First request.
         request: EngineCoreRequest = make_request()
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index 2f1cbec607a91..729975e4ea8c4 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -86,11 +86,10 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
             UsageContext.UNKNOWN_CONTEXT)
         executor_class = AsyncLLM._get_executor_cls(vllm_config)
         client = EngineCoreClient.make_client(
-            vllm_config,
-            executor_class,
-            UsageContext.UNKNOWN_CONTEXT,
             multiprocess_mode=multiprocessing_mode,
             asyncio_mode=False,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
         )
 
         MAX_TOKENS = 20
@@ -158,11 +157,10 @@ async def test_engine_core_client_asyncio(monkeypatch):
             usage_context=UsageContext.UNKNOWN_CONTEXT)
         executor_class = AsyncLLM._get_executor_cls(vllm_config)
         client = EngineCoreClient.make_client(
-            vllm_config,
-            executor_class,
-            UsageContext.UNKNOWN_CONTEXT,
             multiprocess_mode=True,
             asyncio_mode=True,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
         )
 
         MAX_TOKENS = 20
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 2e45b474237f9..094cc15a317e9 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -68,7 +68,7 @@ from vllm.entrypoints.utils import with_cancellation
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path,
-                        is_valid_ipv6_address, set_ulimit)
+                        is_valid_ipv6_address, kill_process_tree, set_ulimit)
 from vllm.version import __version__ as VLLM_VERSION
 
 TIMEOUT_KEEP_ALIVE = 5  # seconds
@@ -737,6 +737,15 @@ async def run_server(args, **uvicorn_kwargs) -> None:
 
     signal.signal(signal.SIGTERM, signal_handler)
 
+    # The child processes will send SIGQUIT to this process when
+    # any error happens. This process then clean up the whole tree.
+    # TODO(rob): move this into AsyncLLM.__init__ once we remove
+    # the context manager below.
+    def sigquit_handler(signum, frame):
+        kill_process_tree(os.getpid())
+
+    signal.signal(signal.SIGQUIT, sigquit_handler)
+
     async with build_async_engine_client(args) as engine_client:
         app = build_app(args)
 
diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index c4d90f0856f86..bc32826529eef 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -1,5 +1,4 @@
 import asyncio
-import multiprocessing
 import os
 import sys
 import threading
@@ -13,10 +12,9 @@ from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO,
 
 import torch
 
-import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.triton_utils.importing import HAS_TRITON
-from vllm.utils import cuda_is_initialized
+from vllm.utils import _check_multiproc_method, get_mp_context
 
 if HAS_TRITON:
     from vllm.triton_utils import maybe_set_triton_cache_manager
@@ -274,24 +272,6 @@ def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
     file.write = write_with_prefix  # type: ignore[method-assign]
 
 
-def _check_multiproc_method():
-    if (cuda_is_initialized()
-            and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
-        logger.warning("CUDA was previously initialized. We must use "
-                       "the `spawn` multiprocessing start method. Setting "
-                       "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
-                       "See https://docs.vllm.ai/en/latest/getting_started/"
-                       "debugging.html#python-multiprocessing "
-                       "for more information.")
-        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-
-
-def get_mp_context():
-    _check_multiproc_method()
-    mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
-    return multiprocessing.get_context(mp_method)
-
-
 def set_multiprocessing_worker_envs(parallel_config):
     """ Set up environment variables that should be used when there are workers
     in a multiprocessing environment. This should be called by the parent 
diff --git a/vllm/utils.py b/vllm/utils.py
index 5eb4e8c4180c4..2b46c1fef0d09 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -10,6 +10,7 @@ import importlib.metadata
 import importlib.util
 import inspect
 import ipaddress
+import multiprocessing
 import os
 import re
 import resource
@@ -20,6 +21,7 @@ import sys
 import tempfile
 import threading
 import time
+import traceback
 import uuid
 import warnings
 import weakref
@@ -29,8 +31,9 @@ from collections.abc import Hashable, Iterable, Mapping
 from dataclasses import dataclass, field
 from functools import lru_cache, partial, wraps
 from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
-                    Dict, Generator, Generic, List, Literal, NamedTuple,
-                    Optional, Tuple, Type, TypeVar, Union, overload)
+                    Dict, Generator, Generic, Iterator, List, Literal,
+                    NamedTuple, Optional, Tuple, Type, TypeVar, Union,
+                    overload)
 from uuid import uuid4
 
 import numpy as np
@@ -39,6 +42,8 @@ import psutil
 import torch
 import torch.types
 import yaml
+import zmq
+import zmq.asyncio
 from packaging.version import Version
 from torch.library import Library
 from typing_extensions import ParamSpec, TypeIs, assert_never
@@ -1844,7 +1849,7 @@ def memory_profiling(
     result.non_kv_cache_memory_in_bytes = result.non_torch_increase_in_bytes + result.torch_peak_increase_in_bytes + result.weights_memory_in_bytes  # noqa
 
 
-# Adapted from: https://github.com/sgl-project/sglang/blob/f46f394f4d4dbe4aae85403dec006199b34d2840/python/sglang/srt/utils.py#L630 # noqa: E501Curre
+# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501
 def set_ulimit(target_soft_limit=65535):
     resource_type = resource.RLIMIT_NOFILE
     current_soft, current_hard = resource.getrlimit(resource_type)
@@ -1859,3 +1864,82 @@ def set_ulimit(target_soft_limit=65535):
                 "with error %s. This can cause fd limit errors like"
                 "`OSError: [Errno 24] Too many open files`. Consider "
                 "increasing with ulimit -n", current_soft, e)
+
+
+# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/utils.py#L28 # noqa: E501
+def get_exception_traceback():
+    etype, value, tb = sys.exc_info()
+    err_str = "".join(traceback.format_exception(etype, value, tb))
+    return err_str
+
+
+# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L783 # noqa: E501
+def make_zmq_socket(
+    ctx: Union[zmq.asyncio.Context, zmq.Context],  # type: ignore[name-defined]
+    path: str,
+    type: Any,
+) -> Union[zmq.Socket, zmq.asyncio.Socket]:  # type: ignore[name-defined]
+    """Make a ZMQ socket with the proper bind/connect semantics."""
+
+    mem = psutil.virtual_memory()
+    socket = ctx.socket(type)
+
+    # Calculate buffer size based on system memory
+    total_mem = mem.total / 1024**3
+    available_mem = mem.available / 1024**3
+    # For systems with substantial memory (>32GB total, >16GB available):
+    # - Set a large 0.5GB buffer to improve throughput
+    # For systems with less memory:
+    # - Use system default (-1) to avoid excessive memory consumption
+    if total_mem > 32 and available_mem > 16:
+        buf_size = int(0.5 * 1024**3)  # 0.5GB in bytes
+    else:
+        buf_size = -1  # Use system default buffer size
+
+    if type == zmq.constants.PULL:
+        socket.setsockopt(zmq.constants.RCVHWM, 0)
+        socket.setsockopt(zmq.constants.RCVBUF, buf_size)
+        socket.connect(path)
+    elif type == zmq.constants.PUSH:
+        socket.setsockopt(zmq.constants.SNDHWM, 0)
+        socket.setsockopt(zmq.constants.SNDBUF, buf_size)
+        socket.bind(path)
+    else:
+        raise ValueError(f"Unknown Socket Type: {type}")
+
+    return socket
+
+
+@contextlib.contextmanager
+def zmq_socket_ctx(
+        path: str,
+        type: Any) -> Iterator[zmq.Socket]:  # type: ignore[name-defined]
+    """Context manager for a ZMQ socket"""
+
+    ctx = zmq.Context(io_threads=2)  # type: ignore[attr-defined]
+    try:
+        yield make_zmq_socket(ctx, path, type)
+
+    except KeyboardInterrupt:
+        logger.debug("Got Keyboard Interrupt.")
+
+    finally:
+        ctx.destroy(linger=0)
+
+
+def _check_multiproc_method():
+    if (cuda_is_initialized()
+            and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
+        logger.warning("CUDA was previously initialized. We must use "
+                       "the `spawn` multiprocessing start method. Setting "
+                       "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
+                       "See https://docs.vllm.ai/en/latest/getting_started/"
+                       "debugging.html#python-multiprocessing "
+                       "for more information.")
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+
+def get_mp_context():
+    _check_multiproc_method()
+    mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
+    return multiprocessing.get_context(mp_method)
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index ba2b8377759d6..da3da6dad6436 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -75,11 +75,11 @@ class AsyncLLM(EngineClient):
 
         # EngineCore (starts the engine in background process).
         self.engine_core = EngineCoreClient.make_client(
-            vllm_config=vllm_config,
-            executor_class=executor_class,
-            usage_context=usage_context,
             multiprocess_mode=True,
             asyncio_mode=True,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=self.log_stats,
         )
 
         self.output_handler: Optional[asyncio.Task] = None
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 0aef61fc7f680..5840541d774ba 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -3,20 +3,19 @@ import queue
 import signal
 import threading
 import time
-from dataclasses import dataclass
-from multiprocessing.process import BaseProcess
+from multiprocessing.connection import Connection
 from typing import List, Tuple, Type
 
+import psutil
 import zmq
 import zmq.asyncio
 from msgspec import msgpack
 
 from vllm.config import CacheConfig, VllmConfig
-from vllm.executor.multiproc_worker_utils import get_mp_context
 from vllm.logger import init_logger
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
-from vllm.usage.usage_lib import UsageContext
+from vllm.utils import get_exception_traceback, zmq_socket_ctx
 from vllm.v1.core.scheduler import Scheduler
 from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
                             EngineCoreProfile, EngineCoreRequest,
@@ -25,14 +24,13 @@ from vllm.v1.engine.mm_input_mapper import MMInputMapperServer
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.serial_utils import PickleEncoder
-from vllm.v1.utils import make_zmq_socket
 from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger(__name__)
 
 POLLING_TIMEOUT_MS = 5000
 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000
-LOGGING_TIME_S = POLLING_TIMEOUT_S
+LOGGING_TIME_S = 5
 
 
 class EngineCore:
@@ -42,9 +40,10 @@ class EngineCore:
         self,
         vllm_config: VllmConfig,
         executor_class: Type[Executor],
-        usage_context: UsageContext,
+        log_stats: bool = False,
     ):
         assert vllm_config.model_config.runner_type != "pooling"
+        self.log_stats = log_stats
 
         logger.info("Initializing an LLM engine (v%s) with config: %s",
                     VLLM_VERSION, vllm_config)
@@ -134,29 +133,19 @@ class EngineCore:
         self.model_executor.profile(is_start)
 
 
-@dataclass
-class EngineCoreProcHandle:
-    proc: BaseProcess
-    ready_path: str
-    input_path: str
-    output_path: str
-
-
 class EngineCoreProc(EngineCore):
     """ZMQ-wrapper for running EngineCore in background process."""
 
-    READY_STR = "READY"
-
     def __init__(
         self,
-        vllm_config: VllmConfig,
-        executor_class: Type[Executor],
-        usage_context: UsageContext,
         input_path: str,
         output_path: str,
-        ready_path: str,
+        ready_pipe: Connection,
+        vllm_config: VllmConfig,
+        executor_class: Type[Executor],
+        log_stats: bool = False,
     ):
-        super().__init__(vllm_config, executor_class, usage_context)
+        super().__init__(vllm_config, executor_class, log_stats)
 
         # Background Threads and Queues for IO. These enable us to
         # overlap ZMQ socket IO with GPU since they release the GIL,
@@ -173,68 +162,7 @@ class EngineCoreProc(EngineCore):
                          daemon=True).start()
 
         # Send Readiness signal to EngineClient.
-        with make_zmq_socket(ready_path, zmq.constants.PUSH) as ready_socket:
-            ready_socket.send_string(EngineCoreProc.READY_STR)
-
-    @staticmethod
-    def wait_for_startup(
-        proc: BaseProcess,
-        ready_path: str,
-    ) -> None:
-        """Wait until the EngineCore is ready."""
-
-        try:
-            sync_ctx = zmq.Context()  # type: ignore[attr-defined]
-            socket = sync_ctx.socket(zmq.constants.PULL)
-            socket.connect(ready_path)
-
-            # Wait for EngineCore to send EngineCoreProc.READY_STR.
-            while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0:
-                logger.debug("Waiting for EngineCoreProc to startup.")
-
-                if not proc.is_alive():
-                    raise RuntimeError("EngineCoreProc failed to start.")
-
-            message = socket.recv_string()
-            assert message == EngineCoreProc.READY_STR
-
-        except BaseException as e:
-            logger.exception(e)
-            raise e
-
-        finally:
-            sync_ctx.destroy(linger=0)
-
-    @staticmethod
-    def make_engine_core_process(
-        vllm_config: VllmConfig,
-        executor_class: Type[Executor],
-        usage_context: UsageContext,
-        input_path: str,
-        output_path: str,
-        ready_path: str,
-    ) -> EngineCoreProcHandle:
-        context = get_mp_context()
-
-        process_kwargs = {
-            "input_path": input_path,
-            "output_path": output_path,
-            "ready_path": ready_path,
-            "vllm_config": vllm_config,
-            "executor_class": executor_class,
-            "usage_context": usage_context,
-        }
-        # Run EngineCore busy loop in background process.
-        proc = context.Process(target=EngineCoreProc.run_engine_core,
-                               kwargs=process_kwargs)
-        proc.start()
-
-        # Wait for startup
-        EngineCoreProc.wait_for_startup(proc, ready_path)
-        return EngineCoreProcHandle(proc=proc,
-                                    ready_path=ready_path,
-                                    input_path=input_path,
-                                    output_path=output_path)
+        ready_pipe.send({"status": "READY"})
 
     @staticmethod
     def run_engine_core(*args, **kwargs):
@@ -258,6 +186,7 @@ class EngineCoreProc(EngineCore):
         signal.signal(signal.SIGTERM, signal_handler)
         signal.signal(signal.SIGINT, signal_handler)
 
+        parent_process = psutil.Process().parent()
         engine_core = None
         try:
             engine_core = EngineCoreProc(*args, **kwargs)
@@ -266,9 +195,10 @@ class EngineCoreProc(EngineCore):
         except SystemExit:
             logger.debug("EngineCore interrupted.")
 
-        except BaseException as e:
-            logger.exception(e)
-            raise e
+        except Exception:
+            traceback = get_exception_traceback()
+            logger.error("EngineCore hit an exception: %s", traceback)
+            parent_process.send_signal(signal.SIGQUIT)
 
         finally:
             if engine_core is not None:
@@ -309,6 +239,9 @@ class EngineCoreProc(EngineCore):
     def _log_stats(self):
         """Log basic stats every LOGGING_TIME_S"""
 
+        if not self.log_stats:
+            return
+
         now = time.time()
 
         if now - self._last_logging_time > LOGGING_TIME_S:
@@ -339,7 +272,7 @@ class EngineCoreProc(EngineCore):
         decoder_add_req = PickleEncoder()
         decoder_abort_req = PickleEncoder()
 
-        with make_zmq_socket(input_path, zmq.constants.PULL) as socket:
+        with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket:
             while True:
                 # (RequestType, RequestData)
                 type_frame, data_frame = socket.recv_multipart(copy=False)
@@ -367,7 +300,7 @@ class EngineCoreProc(EngineCore):
         # Reuse send buffer.
         buffer = bytearray()
 
-        with make_zmq_socket(output_path, zmq.constants.PUSH) as socket:
+        with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket:
             while True:
                 engine_core_outputs = self.output_queue.get()
                 outputs = EngineCoreOutputs(outputs=engine_core_outputs)
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index d56fcbdb1e7c4..beb5d57c20c83 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -1,19 +1,19 @@
-import os
-import weakref
-from typing import List, Optional
+from typing import List, Optional, Type
 
 import msgspec
 import zmq
 import zmq.asyncio
 
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import get_open_zmq_ipc_path, kill_process_tree
+from vllm.utils import get_open_zmq_ipc_path
 from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
                             EngineCoreProfile, EngineCoreRequest,
                             EngineCoreRequestType, EngineCoreRequestUnion)
-from vllm.v1.engine.core import (EngineCore, EngineCoreProc,
-                                 EngineCoreProcHandle)
+from vllm.v1.engine.core import EngineCore, EngineCoreProc
+from vllm.v1.executor.abstract import Executor
 from vllm.v1.serial_utils import PickleEncoder
+from vllm.v1.utils import BackgroundProcHandle
 
 logger = init_logger(__name__)
 
@@ -31,10 +31,11 @@ class EngineCoreClient:
 
     @staticmethod
     def make_client(
-        *args,
         multiprocess_mode: bool,
         asyncio_mode: bool,
-        **kwargs,
+        vllm_config: VllmConfig,
+        executor_class: Type[Executor],
+        log_stats: bool = False,
     ) -> "EngineCoreClient":
 
         # TODO: support this for debugging purposes.
@@ -44,12 +45,12 @@ class EngineCoreClient:
                 "is not currently supported.")
 
         if multiprocess_mode and asyncio_mode:
-            return AsyncMPClient(*args, **kwargs)
+            return AsyncMPClient(vllm_config, executor_class, log_stats)
 
         if multiprocess_mode and not asyncio_mode:
-            return SyncMPClient(*args, **kwargs)
+            return SyncMPClient(vllm_config, executor_class, log_stats)
 
-        return InprocClient(*args, **kwargs)
+        return InprocClient(vllm_config, executor_class, log_stats)
 
     def shutdown(self):
         pass
@@ -128,9 +129,10 @@ class MPClient(EngineCoreClient):
 
     def __init__(
         self,
-        *args,
         asyncio_mode: bool,
-        **kwargs,
+        vllm_config: VllmConfig,
+        executor_class: Type[Executor],
+        log_stats: bool = False,
     ):
         # Serialization setup.
         self.encoder = PickleEncoder()
@@ -143,7 +145,6 @@ class MPClient(EngineCoreClient):
             self.ctx = zmq.Context()  # type: ignore[attr-defined]
 
         # Path for IPC.
-        ready_path = get_open_zmq_ipc_path()
         output_path = get_open_zmq_ipc_path()
         input_path = get_open_zmq_ipc_path()
 
@@ -156,47 +157,40 @@ class MPClient(EngineCoreClient):
         self.input_socket.bind(input_path)
 
         # Start EngineCore in background process.
-        self.proc_handle: Optional[EngineCoreProcHandle]
-        self.proc_handle = EngineCoreProc.make_engine_core_process(
-            *args,
-            input_path=
-            input_path,  # type: ignore[misc]  # MyPy incorrectly flags duplicate keywords
-            output_path=output_path,  # type: ignore[misc]
-            ready_path=ready_path,  # type: ignore[misc]
-            **kwargs,
-        )
-        self._finalizer = weakref.finalize(self, self.shutdown)
+        self.proc_handle: Optional[BackgroundProcHandle]
+        self.proc_handle = BackgroundProcHandle(
+            input_path=input_path,
+            output_path=output_path,
+            process_name="EngineCore",
+            target_fn=EngineCoreProc.run_engine_core,
+            process_kwargs={
+                "vllm_config": vllm_config,
+                "executor_class": executor_class,
+                "log_stats": log_stats,
+            })
 
     def shutdown(self):
         # Shut down the zmq context.
         self.ctx.destroy(linger=0)
 
         if hasattr(self, "proc_handle") and self.proc_handle:
-            # Shutdown the process if needed.
-            if self.proc_handle.proc.is_alive():
-                self.proc_handle.proc.terminate()
-                self.proc_handle.proc.join(5)
-
-                if self.proc_handle.proc.is_alive():
-                    kill_process_tree(self.proc_handle.proc.pid)
-
-            # Remove zmq ipc socket files
-            ipc_sockets = [
-                self.proc_handle.ready_path, self.proc_handle.output_path,
-                self.proc_handle.input_path
-            ]
-            for ipc_socket in ipc_sockets:
-                socket_file = ipc_socket.replace("ipc://", "")
-                if os and os.path.exists(socket_file):
-                    os.remove(socket_file)
+            self.proc_handle.shutdown()
             self.proc_handle = None
 
 
 class SyncMPClient(MPClient):
     """Synchronous client for multi-proc EngineCore."""
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, asyncio_mode=False, **kwargs)
+    def __init__(self,
+                 vllm_config: VllmConfig,
+                 executor_class: Type[Executor],
+                 log_stats: bool = False):
+        super().__init__(
+            asyncio_mode=False,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=log_stats,
+        )
 
     def get_output(self) -> List[EngineCoreOutput]:
 
@@ -225,8 +219,16 @@ class SyncMPClient(MPClient):
 class AsyncMPClient(MPClient):
     """Asyncio-compatible client for multi-proc EngineCore."""
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, asyncio_mode=True, **kwargs)
+    def __init__(self,
+                 vllm_config: VllmConfig,
+                 executor_class: Type[Executor],
+                 log_stats: bool = False):
+        super().__init__(
+            asyncio_mode=True,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=log_stats,
+        )
 
     async def get_output_async(self) -> List[EngineCoreOutput]:
 
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index b58f62778ffe9..fc323184abc8f 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -72,11 +72,11 @@ class LLMEngine:
 
         # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
         self.engine_core = EngineCoreClient.make_client(
-            vllm_config,
-            executor_class,
-            usage_context,
             multiprocess_mode=multiprocess_mode,
             asyncio_mode=False,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=False,
         )
 
     @classmethod
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 128101aa6956d..ed64e7741390d 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -17,13 +17,12 @@ from vllm.distributed import (destroy_distributed_environment,
 from vllm.distributed.device_communicators.shm_broadcast import (Handle,
                                                                  MessageQueue)
 from vllm.executor.multiproc_worker_utils import (
-    _add_prefix, get_mp_context, set_multiprocessing_worker_envs)
+    _add_prefix, set_multiprocessing_worker_envs)
 from vllm.logger import init_logger
-from vllm.utils import (get_distributed_init_method, get_open_port,
-                        get_open_zmq_ipc_path)
+from vllm.utils import (get_distributed_init_method, get_mp_context,
+                        get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx)
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.outputs import ModelRunnerOutput
-from vllm.v1.utils import make_zmq_socket
 from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
@@ -250,7 +249,7 @@ class WorkerProc:
         worker_response_mq_handle = self.worker_response_mq.export_handle()
 
         # Send Readiness signal to EngineCore process.
-        with make_zmq_socket(ready_path, zmq.constants.PUSH) as ready_socket:
+        with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket:
             payload = pickle.dumps(worker_response_mq_handle,
                                    protocol=pickle.HIGHEST_PROTOCOL)
             ready_socket.send_string(WorkerProc.READY_STR)
@@ -352,7 +351,7 @@ class WorkerProc:
         ready_path: str,
     ) -> Optional[Handle]:
         """Wait until the Worker is ready."""
-        with make_zmq_socket(ready_path, zmq.constants.PULL) as socket:
+        with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket:
 
             # Wait for Worker to send READY.
             while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0:
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index e802c6439b740..19e0dd17237c9 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -1,11 +1,11 @@
+import os
+import weakref
 from collections.abc import Sequence
-from contextlib import contextmanager
-from typing import (Any, Generic, Iterator, List, Optional, TypeVar, Union,
-                    overload)
-
-import zmq
+from typing import (Any, Callable, Dict, Generic, List, Optional, TypeVar,
+                    Union, overload)
 
 from vllm.logger import init_logger
+from vllm.utils import get_mp_context, kill_process_tree
 
 logger = init_logger(__name__)
 
@@ -77,27 +77,58 @@ class ConstantList(Generic[T], Sequence):
         return len(self._x)
 
 
-@contextmanager
-def make_zmq_socket(
-        path: str,
-        type: Any) -> Iterator[zmq.Socket]:  # type: ignore[name-defined]
-    """Context manager for a ZMQ socket"""
+class BackgroundProcHandle:
+    """
+    Utility class to handle creation, readiness, and shutdown
+    of background processes used by the AsyncLLM and LLMEngine.
+    """
 
-    ctx = zmq.Context()  # type: ignore[attr-defined]
-    try:
-        socket = ctx.socket(type)
+    def __init__(
+        self,
+        input_path: str,
+        output_path: str,
+        process_name: str,
+        target_fn: Callable,
+        process_kwargs: Dict[Any, Any],
+    ):
+        self._finalizer = weakref.finalize(self, self.shutdown)
 
-        if type == zmq.constants.PULL:
-            socket.connect(path)
-        elif type == zmq.constants.PUSH:
-            socket.bind(path)
-        else:
-            raise ValueError(f"Unknown Socket Type: {type}")
+        context = get_mp_context()
+        reader, writer = context.Pipe(duplex=False)
 
-        yield socket
+        assert ("ready_pipe" not in process_kwargs
+                and "input_path" not in process_kwargs
+                and "output_path" not in process_kwargs)
+        process_kwargs["ready_pipe"] = writer
+        process_kwargs["input_path"] = input_path
+        process_kwargs["output_path"] = output_path
+        self.input_path = input_path
+        self.output_path = output_path
 
-    except KeyboardInterrupt:
-        logger.debug("Worker had Keyboard Interrupt.")
+        # Run Detokenizer busy loop in background process.
+        self.proc = context.Process(target=target_fn, kwargs=process_kwargs)
+        self.proc.start()
 
-    finally:
-        ctx.destroy(linger=0)
+        # Wait for startup.
+        if reader.recv()["status"] != "READY":
+            raise RuntimeError(f"{process_name} initialization failed. "
+                               "See root cause above.")
+
+    def __del__(self):
+        self.shutdown()
+
+    def shutdown(self):
+        # Shutdown the process if needed.
+        if hasattr(self, "proc") and self.proc.is_alive():
+            self.proc.terminate()
+            self.proc.join(5)
+
+            if self.proc.is_alive():
+                kill_process_tree(self.proc.pid)
+
+        # Remove zmq ipc socket files
+        ipc_sockets = [self.output_path, self.input_path]
+        for ipc_socket in ipc_sockets:
+            socket_file = ipc_socket.replace("ipc://", "")
+            if os and os.path.exists(socket_file):
+                os.remove(socket_file)

From b5cbe8eeb30e86c8477d91c66f5c7a10e4ee754b Mon Sep 17 00:00:00 2001
From: Rajveer Bachkaniwala <46040700+rajveerb@users.noreply.github.com>
Date: Fri, 27 Dec 2024 22:34:46 -0500
Subject: [PATCH 16/48] [Bugfix] Last token measurement fix (#11376)

Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
---
 vllm/engine/llm_engine.py |  8 ++++++--
 vllm/sequence.py          | 24 ++++++++++++++----------
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 39f59e55da1f7..1db3e59ff3bae 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1124,6 +1124,8 @@ class LLMEngine:
 
             seq_group = scheduled_seq_group.seq_group
             seq_group.maybe_set_first_token_time(now)
+            if not seq_group.is_prefill():
+                seq_group.set_last_token_time(now)
             request_output = RequestOutputFactory.create(
                 seq_group,
                 self.seq_id_to_seq_group,
@@ -1166,6 +1168,8 @@ class LLMEngine:
 
             seq_group = scheduled_seq_group.seq_group
             seq_group.maybe_set_first_token_time(now)
+            if not seq_group.is_prefill():
+                seq_group.set_last_token_time(now)
             request_output = RequestOutputFactory.create(
                 seq_group,
                 self.seq_id_to_seq_group,
@@ -1686,7 +1690,7 @@ class LLMEngine:
                     # If the seq_group just finished the prefill state
                     # get TTFT.
                     if not seq_group.is_prefill():
-                        latency = seq_group.get_last_latency(now)
+                        latency = seq_group.get_last_token_latency()
                         time_to_first_tokens_iter.append(latency)
 
                         # One generation token per finished prefill.
@@ -1694,7 +1698,7 @@ class LLMEngine:
                             seq_group.num_seqs())
                 else:
                     # TPOTs.
-                    latency = seq_group.get_last_latency(now)
+                    latency = seq_group.get_last_token_latency()
                     time_per_output_tokens_iter.append(latency)
                     if seq_group.state.current_step == 0:
                         # For async_output_proc, the do_log_stats()
diff --git a/vllm/sequence.py b/vllm/sequence.py
index cc3d96fc93a79..34f910d47b7d9 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -667,6 +667,7 @@ class SequenceGroup:
                                       first_scheduled_time=None,
                                       first_token_time=None,
                                       time_in_queue=None)
+        self.last_token_latency = 0.0
         self.lora_request = lora_request
         self.prompt_logprobs: Optional[PromptLogprobs] = None
         self.state = SequenceGroupState()
@@ -762,18 +763,21 @@ class SequenceGroup:
             assert num_lookahead_slots + 1 == num_scheduler_steps or is_prefill
             self.init_multi_step(num_steps=num_lookahead_slots + 1)
 
-    def get_last_latency(self, now: float) -> float:
+    def set_last_token_time(self, now: float) -> None:
         """Sets the last token time for Request level timings."""
-        # If still in prefill phase, raise Error.
-        if self.is_prefill():
-            raise ValueError(
-                "seq_group.get_last_latency() should not be called "
-                "if the seq_group is in prefill phase.")
-
-        # Otherwise return token latency.
-        latency = now - self.metrics.last_token_time
+        # If still in prefill phase, assertion fails.
+        assert not self.is_prefill(), (
+            "seq_group.set_last_token_time() should not be called "
+            "if the seq_group is in prefill phase.")
+        self.last_token_latency = now - self.metrics.last_token_time
         self.metrics.last_token_time = now
-        return latency
+
+    def get_last_token_latency(self) -> float:
+        """Returns the latency of the last token."""
+        assert not self.is_prefill(), (
+            "seq_group.get_last_token_latency() should not be called "
+            "if the seq_group is in prefill phase.")
+        return self.last_token_latency
 
     def maybe_set_first_token_time(self, time: float) -> None:
         """Sets the first token time for Request level timings."""

From d34be24bb196cb0cce167257c97449f0cd6858f7 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 28 Dec 2024 14:14:10 +0800
Subject: [PATCH 17/48] [Model] Support InternLM2 Reward models (#11571)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 docs/source/models/supported_models.md  |  5 +++
 tests/models/registry.py                |  2 +
 vllm/model_executor/models/internlm2.py | 60 ++++++++++++++++++++++++-
 vllm/model_executor/models/registry.py  |  1 +
 4 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 7acafda50793c..fa7102cd88063 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -450,6 +450,11 @@ of the whole prompt are extracted from the normalized hidden state corresponding
     - Example HF Models
     - :ref:`LoRA <lora-adapter>`
     - :ref:`PP <distributed-serving>`
+  * - :code:`InternLM2ForRewardModel`
+    - InternLM2-based
+    - :code:`internlm/internlm2-1_8b-reward`, :code:`internlm/internlm2-7b-reward`, etc.
+    - ✅︎
+    - ✅︎
   * - :code:`LlamaForCausalLM`
     - Llama-based
     - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc.
diff --git a/tests/models/registry.py b/tests/models/registry.py
index f5a37420a2909..e5dfb2822745d 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -140,6 +140,8 @@ _EMBEDDING_EXAMPLE_MODELS = {
     "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"),
     "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"),
     "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
+    "InternLM2ForRewardModel": _HfExamplesInfo("internlm/internlm2-1_8b-reward",
+                                               trust_remote_code=True),
     "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"),  # noqa: E501
     "LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
     "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 41b9f110d771f..28c23edd4c8e8 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -18,14 +18,16 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.pooler import Pooler, PoolingType
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import IntermediateTensors
+from vllm.sequence import IntermediateTensors, PoolerOutput
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (is_pp_missing_parameter,
@@ -433,3 +435,59 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
+
+
+class InternLM2ForRewardModel(InternLM2ForCausalLM):
+
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        model_type: Type[InternLM2Model] = InternLM2Model,
+    ):
+        super().__init__(vllm_config=vllm_config,
+                         prefix=prefix,
+                         model_type=model_type)
+
+        for attr in ("output", "logits_processor", "sampler"):
+            delattr(self, attr)
+
+        config = vllm_config.model_config.hf_config
+        self.v_head = RowParallelLinear(
+            config.hidden_size,
+            1,
+            bias=False,
+            input_is_parallel=False,
+            prefix=maybe_prefix(prefix, "v_head"),
+        )
+
+        pooler_config = vllm_config.model_config.pooler_config
+        self._pooler = Pooler.from_config_with_defaults(
+            pooler_config,
+            pooling_type=PoolingType.ALL,
+            normalize=False,
+            softmax=False,
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   attn_metadata, intermediate_tensors,
+                                   inputs_embeds)
+        logits, _ = self.v_head(hidden_states)
+        return logits
+
+    def pooler(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> Optional[PoolerOutput]:
+        return self._pooler(hidden_states, pooling_metadata)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 89992de7e238d..67268eb4bb85f 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -113,6 +113,7 @@ _EMBEDDING_MODELS = {
     "Gemma2Model": ("gemma2", "Gemma2ForCausalLM"),
     "GlmForCausalLM": ("glm", "GlmForCausalLM"),
     "GritLM": ("gritlm", "GritLM"),
+    "InternLM2ForRewardModel": ("internlm2", "InternLM2ForRewardModel"),
     "JambaForSequenceClassification": ("jamba", "JambaForSequenceClassification"),  # noqa: E501
     "LlamaModel": ("llama", "LlamaForCausalLM"),
     **{

From b7dcc003dc1ace7605946d52b7e077ba1d3bbe86 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Sat, 28 Dec 2024 02:54:23 -0800
Subject: [PATCH 18/48] [Model] Remove hardcoded image tokens ids from Pixtral
 (#11582)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 vllm/model_executor/models/pixtral.py | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index f3d66c2313198..22d29f5bbc50c 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -45,13 +45,6 @@ try:
 except ImportError:
     USE_XFORMERS_OPS = False
 
-# These token ids cannot be retrieved from model config
-# so we hardcode them here.
-PIXTRAL_12B_IMAGE_BREAK_ID = 12
-PIXTRAL_12B_IMAGE_END_ID = 13
-PIXTRAL_LARGE_IMAGE_BREAK_ID = 14
-PIXTRAL_LARGE_IMAGE_END_ID = 15
-
 
 def get_max_pixtral_image_tokens(ctx: InputContext):
     tokenizer = cached_get_tokenizer(
@@ -201,6 +194,13 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
             if key in dataclass_fields
         }
 
+        if not ("image_break_token_id" in vision_args
+                and "image_end_token_id" in vision_args):
+            raise ValueError(
+                "'image_break_token_id' and 'image_end_token_id' not found "
+                "in the vision_encoder arguments. Please download the latest "
+                "version of 'params.json' from the model repository.")
+
         self.vision_args = VisionEncoderArgs(**vision_args)
 
         # init MistralForCausalLM
@@ -240,9 +240,8 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         # NOTE: Image embeddings are split into separate tensors for each image
         # by the indices of `[IMG_END]` token.
-        image_end_condition = (image_tokens == PIXTRAL_12B_IMAGE_END_ID) | (
-            image_tokens == PIXTRAL_LARGE_IMAGE_END_ID)
-        split_indices = torch.where(image_end_condition)[0] + 1
+        image_end_mask = image_tokens == self.vision_args.image_end_token_id
+        split_indices = torch.where(image_end_mask)[0] + 1
         if len(split_indices) <= 1:
             # Do not split, return as tensor of shape [1, fs, hs]
             return image_embeds.unsqueeze(0)
@@ -265,10 +264,8 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings, [
                     self.vision_args.image_token_id,
-                    PIXTRAL_12B_IMAGE_END_ID,
-                    PIXTRAL_12B_IMAGE_BREAK_ID,
-                    PIXTRAL_LARGE_IMAGE_BREAK_ID,
-                    PIXTRAL_LARGE_IMAGE_END_ID,
+                    self.vision_args.image_break_token_id,
+                    self.vision_args.image_end_token_id,
                 ])
         return inputs_embeds
 
@@ -409,6 +406,8 @@ class VisionEncoderArgs:
     num_attention_heads: int
     rope_theta: float  # for rope-2D
     image_token_id: int
+    image_break_token_id: int
+    image_end_token_id: int
     adapter_bias: bool = True
 
 

From 59d6bb4c863e511e58799efac847065c28c52c8b Mon Sep 17 00:00:00 2001
From: hj-wei <hjwei_xd@163.com>
Date: Sat, 28 Dec 2024 19:17:35 +0800
Subject: [PATCH 19/48] [Hardware][AMD]: Replace HIPCC version with more
 precise ROCm version (#11515)

Signed-off-by: hjwei <hjwei_xd@163.com>
---
 setup.py | 52 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 23 deletions(-)

diff --git a/setup.py b/setup.py
index 61d2d710aa20e..ba6953dbdc174 100644
--- a/setup.py
+++ b/setup.py
@@ -1,3 +1,4 @@
+import ctypes
 import importlib.util
 import logging
 import os
@@ -13,7 +14,7 @@ from packaging.version import Version, parse
 from setuptools import Extension, find_packages, setup
 from setuptools.command.build_ext import build_ext
 from setuptools_scm import get_version
-from torch.utils.cpp_extension import CUDA_HOME
+from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
 
 
 def load_module_from_path(module_name, path):
@@ -379,25 +380,31 @@ def _build_custom_ops() -> bool:
     return _is_cuda() or _is_hip() or _is_cpu()
 
 
-def get_hipcc_rocm_version():
-    # Run the hipcc --version command
-    result = subprocess.run(['hipcc', '--version'],
-                            stdout=subprocess.PIPE,
-                            stderr=subprocess.STDOUT,
-                            text=True)
+def get_rocm_version():
+    # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
+    # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
+    try:
+        librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
+        if not librocm_core_file.is_file():
+            return None
+        librocm_core = ctypes.CDLL(librocm_core_file)
+        VerErrors = ctypes.c_uint32
+        get_rocm_core_version = librocm_core.getROCmVersion
+        get_rocm_core_version.restype = VerErrors
+        get_rocm_core_version.argtypes = [
+            ctypes.POINTER(ctypes.c_uint32),
+            ctypes.POINTER(ctypes.c_uint32),
+            ctypes.POINTER(ctypes.c_uint32),
+        ]
+        major = ctypes.c_uint32()
+        minor = ctypes.c_uint32()
+        patch = ctypes.c_uint32()
 
-    # Check if the command was executed successfully
-    if result.returncode != 0:
-        print("Error running 'hipcc --version'")
+        if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
+                                  ctypes.byref(patch)) == 0):
+            return "%d.%d.%d" % (major.value, minor.value, patch.value)
         return None
-
-    # Extract the version using a regular expression
-    match = re.search(r'HIP version: (\S+)', result.stdout)
-    if match:
-        # Return the version string
-        return match.group(1)
-    else:
-        print("Could not find HIP version in the output")
+    except Exception:
         return None
 
 
@@ -479,11 +486,10 @@ def get_vllm_version() -> str:
                 if "sdist" not in sys.argv:
                     version += f"{sep}cu{cuda_version_str}"
     elif _is_hip():
-        # Get the HIP version
-        hipcc_version = get_hipcc_rocm_version()
-        if hipcc_version != MAIN_CUDA_VERSION:
-            rocm_version_str = hipcc_version.replace(".", "")[:3]
-            version += f"{sep}rocm{rocm_version_str}"
+        # Get the Rocm Version
+        rocm_version = get_rocm_version() or torch.version.hip
+        if rocm_version and rocm_version != MAIN_CUDA_VERSION:
+            version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
     elif _is_neuron():
         # Get the Neuron version
         neuron_version = str(get_neuronxcc_version())

From 42bb201fd6f79d6ed2e28e0263ffa891cd993c4c Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Sat, 28 Dec 2024 22:33:12 +0900
Subject: [PATCH 20/48] [V1][Minor] Set pin_memory=False for token_ids_cpu
 tensor (#11581)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/v1/worker/gpu_input_batch.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 6c4d300ec6efe..e79145300fe06 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -57,11 +57,13 @@ class InputBatch:
 
         # TODO(woosuk): This buffer could be too large if max_model_len is big.
         # Find a way to reduce the CPU memory usage.
+        # This buffer is not directly transferred to the GPU, so it does not
+        # need to be pinned.
         self.token_ids_cpu_tensor = torch.zeros(
             (max_num_reqs, max_model_len),
             device="cpu",
             dtype=torch.int32,
-            pin_memory=pin_memory,
+            pin_memory=False,
         )
         self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
         self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)

From d427e5cfda8d2536b81e6021128e71b2dbc281aa Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 28 Dec 2024 21:53:59 +0800
Subject: [PATCH 21/48] [Doc] Minor documentation fixes (#11580)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/contributing/dockerfile/dockerfile.md  | 6 +++---
 docs/source/contributing/overview.md               | 2 +-
 docs/source/getting_started/arm-installation.md    | 2 +-
 docs/source/getting_started/cpu-installation.md    | 4 ++--
 docs/source/getting_started/gaudi-installation.md  | 8 +++++---
 docs/source/getting_started/neuron-installation.md | 2 +-
 docs/source/getting_started/quickstart.md          | 4 ++--
 docs/source/getting_started/tpu-installation.md    | 2 +-
 docs/source/models/supported_models.md             | 6 +++---
 docs/source/serving/deploying_with_cerebrium.md    | 6 +++---
 docs/source/serving/deploying_with_dstack.md       | 2 +-
 docs/source/serving/distributed_serving.md         | 6 +++---
 docs/source/serving/runai_model_streamer.md        | 2 +-
 13 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md
index 6535414a7dca4..7ffec83333d7d 100644
--- a/docs/source/contributing/dockerfile/dockerfile.md
+++ b/docs/source/contributing/dockerfile/dockerfile.md
@@ -11,11 +11,11 @@ Below is a visual representation of the multi-stage Dockerfile. The build graph
 
 The edges of the build graph represent:
 
-- FROM ... dependencies (with a solid line and a full arrow head)
+- `FROM ...` dependencies (with a solid line and a full arrow head)
 
-- COPY --from=... dependencies (with a dashed line and an empty arrow head)
+- `COPY --from=...` dependencies (with a dashed line and an empty arrow head)
 
-- RUN --mount=(.\*)from=... dependencies (with a dotted line and an empty diamond arrow head)
+- `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head)
 
   > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png
   > :align: center
diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md
index 9dac41cff0bcb..c960790f47a13 100644
--- a/docs/source/contributing/overview.md
+++ b/docs/source/contributing/overview.md
@@ -34,7 +34,7 @@ pytest tests/
 ```
 
 ```{note}
-Currently, the repository does not pass the `mypy` tests.
+Currently, the repository is not fully checked by `mypy`.
 ```
 
 # Contribution Guidelines
diff --git a/docs/source/getting_started/arm-installation.md b/docs/source/getting_started/arm-installation.md
index de807e198b4f6..799b597b3ad5d 100644
--- a/docs/source/getting_started/arm-installation.md
+++ b/docs/source/getting_started/arm-installation.md
@@ -20,7 +20,7 @@ Contents:
 ## Requirements
 
 - **Operating System**: Linux or macOS
-- **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended)
+- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended)
 - **Instruction Set Architecture (ISA)**: NEON support is required
 
 (arm-backend-quick-start-dockerfile)=
diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/cpu-installation.md
index b6f181ace6274..c3d3f715ed804 100644
--- a/docs/source/getting_started/cpu-installation.md
+++ b/docs/source/getting_started/cpu-installation.md
@@ -24,7 +24,7 @@ Table of contents:
 ## Requirements
 
 - OS: Linux
-- Compiler: gcc/g++>=12.3.0 (optional, recommended)
+- Compiler: `gcc/g++>=12.3.0` (optional, recommended)
 - Instruction set architecture (ISA) requirement: AVX512 (optional, recommended)
 
 (cpu-backend-quick-start-dockerfile)=
@@ -69,7 +69,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
 
 ```{note}
 - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
-- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building.
+- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building.
 ```
 
 (env-intro)=
diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/gaudi-installation.md
index acf42f210dffb..447bf98084a5d 100644
--- a/docs/source/getting_started/gaudi-installation.md
+++ b/docs/source/getting_started/gaudi-installation.md
@@ -167,6 +167,8 @@ Currently in vLLM for HPU we support four execution modes, depending on selected
 In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
 ```
 
+(gaudi-bucketing-mechanism)=
+
 ### Bucketing mechanism
 
 Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution.
@@ -185,7 +187,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, ma
 INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
 ```
 
-`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
+`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling -- `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
 
 Example (with ramp-up)
 
@@ -214,7 +216,7 @@ If a request exceeds maximum bucket size in any dimension, it will be processed
 As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket.
 
 ```{note}
-Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
+Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
 ```
 
 ### Warmup
@@ -235,7 +237,7 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size
 INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
 ```
 
-This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
+This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
 
 ```{tip}
 Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
diff --git a/docs/source/getting_started/neuron-installation.md b/docs/source/getting_started/neuron-installation.md
index d6de5760cc82c..baaeeb9f53a10 100644
--- a/docs/source/getting_started/neuron-installation.md
+++ b/docs/source/getting_started/neuron-installation.md
@@ -26,7 +26,7 @@ Installation steps:
 (build-from-source-neuron)=
 
 ```{note}
-The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
+The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
 ```
 
 ## Build from source
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index 165e5df146dcd..9c8b7e4f592c9 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -114,7 +114,7 @@ $         "temperature": 0
 $     }'
 ```
 
-Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` python package:
+Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package:
 
 ```python
 from openai import OpenAI
@@ -151,7 +151,7 @@ $         ]
 $     }'
 ```
 
-Alternatively, you can use the `openai` python package:
+Alternatively, you can use the `openai` Python package:
 
 ```python
 from openai import OpenAI
diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/tpu-installation.md
index f2a949e7247d8..17eded4a51fec 100644
--- a/docs/source/getting_started/tpu-installation.md
+++ b/docs/source/getting_started/tpu-installation.md
@@ -103,7 +103,7 @@ Connect to your TPU using SSH:
 gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE
 ```
 
-Install Miniconda
+Install Miniconda:
 
 ```bash
 wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index fa7102cd88063..f6e00fa71a310 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -435,7 +435,7 @@ despite being described otherwise on its model card.
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
-:func:`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings
+{func}`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings
 of the whole prompt are extracted from the normalized hidden state corresponding to the last token.
 
 #### Reward Modeling (`--task reward`)
@@ -468,7 +468,7 @@ of the whole prompt are extracted from the normalized hidden state corresponding
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
-:func:`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
+{func}`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
 
 ```{important}
 For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
@@ -500,7 +500,7 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
-:func:`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
+{func}`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
 
 #### Sentence Pair Scoring (`--task score`)
 
diff --git a/docs/source/serving/deploying_with_cerebrium.md b/docs/source/serving/deploying_with_cerebrium.md
index 4863936236119..950064c8c1b10 100644
--- a/docs/source/serving/deploying_with_cerebrium.md
+++ b/docs/source/serving/deploying_with_cerebrium.md
@@ -33,7 +33,7 @@ docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
 vllm = "latest"
 ```
 
-Next, let us add our code to handle inference for the LLM of your choice(`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your main.py\`:
+Next, let us add our code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your `main.py`:
 
 ```python
 from vllm import LLM, SamplingParams
@@ -55,13 +55,13 @@ def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
     return {"results": results}
 ```
 
-Then, run the following code to deploy it to the cloud
+Then, run the following code to deploy it to the cloud:
 
 ```console
 $ cerebrium deploy
 ```
 
-If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case /run)
+If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case` /run`)
 
 ```python
 curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
diff --git a/docs/source/serving/deploying_with_dstack.md b/docs/source/serving/deploying_with_dstack.md
index 65ef1c0016208..381f5f786ca2c 100644
--- a/docs/source/serving/deploying_with_dstack.md
+++ b/docs/source/serving/deploying_with_dstack.md
@@ -25,7 +25,7 @@ $ cd vllm-dstack
 $ dstack init
 ```
 
-Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
+Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
 
 ```yaml
 type: service
diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md
index c0a4b23f6dc70..7446b7c84cf46 100644
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@@ -8,7 +8,7 @@ Before going into the details of distributed inference and serving, let's first
 
 - **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference.
 - **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4.
-- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2.
+- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2.
 
 In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes.
 
@@ -77,7 +77,7 @@ Then you get a ray cluster of containers. Note that you need to keep the shells
 
 Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
 
-After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
+After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
 
 ```console
 $ vllm serve /path/to/the/model/in/the/container \
@@ -85,7 +85,7 @@ $     --tensor-parallel-size 8 \
 $     --pipeline-parallel-size 2
 ```
 
-You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 16:
+You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16:
 
 ```console
 $ vllm serve /path/to/the/model/in/the/container \
diff --git a/docs/source/serving/runai_model_streamer.md b/docs/source/serving/runai_model_streamer.md
index 1b5756a95075a..d4269050ff574 100644
--- a/docs/source/serving/runai_model_streamer.md
+++ b/docs/source/serving/runai_model_streamer.md
@@ -41,7 +41,7 @@ For reading from S3, it will be the number of client instances the host is openi
 $ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}'
 ```
 
-You can controls the size of the CPU Memory buffer to which tensors are read from the file, and limit this size.
+You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size.
 You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit).
 
 ```console

From 328841d00294fb8226f0368cc380350b3d671d77 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 29 Dec 2024 00:55:42 +0800
Subject: [PATCH 22/48] [bugfix] interleaving sliding window for cohere2 model
 (#11583)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 docs/source/models/supported_models.md      |   2 +-
 tests/models/test_initialization.py         |   4 -
 vllm/config.py                              |   2 +-
 vllm/model_executor/models/commandr.py      |  10 +-
 vllm/transformers_utils/config.py           |   7 +-
 vllm/transformers_utils/configs/__init__.py |   2 +
 vllm/transformers_utils/configs/cohere2.py  | 192 ++++++++++++++++++++
 7 files changed, 206 insertions(+), 13 deletions(-)
 create mode 100644 vllm/transformers_utils/configs/cohere2.py

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index f6e00fa71a310..e11befbb8dd30 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -112,7 +112,7 @@ See [this page](#generative-models) for more information on how to use generativ
     - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc.
     - ✅︎
     - ✅︎
-  * - :code:`CohereForCausalLM`,:code:`Cohere2ForCausalLM`
+  * - :code:`CohereForCausalLM`, :code:`Cohere2ForCausalLM`
     - Command-R
     - :code:`CohereForAI/c4ai-command-r-v01`, :code:`CohereForAI/c4ai-command-r7b-12-2024`, etc.
     - ✅︎
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index a4eea7f035c91..3b728f2744fca 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -1,7 +1,6 @@
 from unittest.mock import patch
 
 import pytest
-import transformers
 from transformers import PretrainedConfig
 
 from vllm import LLM
@@ -12,9 +11,6 @@ from .registry import HF_EXAMPLE_MODELS
 @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
 def test_can_initialize(model_arch):
     model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
-    if (model_arch == "Cohere2ForCausalLM"
-            and transformers.__version__ < "4.48.0"):
-        pytest.skip(reason="Model introduced in HF >= 4.48.0")
     if not model_info.is_available_online:
         pytest.skip("Model is not available online")
 
diff --git a/vllm/config.py b/vllm/config.py
index ac767bbe14be4..6ae1d4d944447 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -301,7 +301,7 @@ class ModelConfig:
         sliding_window = getattr(self.hf_text_config, "sliding_window", None)
         has_interleaved_attention = (sliding_window is not None) and (
             isinstance(sliding_window, list) or
-            (self.hf_text_config.model_type in ["gemma2"]))
+            (self.hf_text_config.model_type in ["gemma2", "cohere2"]))
 
         if (not self.disable_sliding_window and has_interleaved_attention):
             if envs.VLLM_ATTENTION_BACKEND == "XFORMERS":
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index c846e42f1b0c3..d22d1f3171463 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -172,16 +172,18 @@ class CohereAttention(nn.Module):
             is_neox_style=False,
         )
 
-        sliding_window = getattr(config, "sliding_window", None)
-        # Model v2 has sliding windows, v1 does not
-        self.v1 = sliding_window is None
+        # Model v2 has interleaved sliding windows, v1 does not
+        interleaved_sliding_window = getattr(config,
+                                             "interleaved_sliding_window",
+                                             None)
+        self.v1 = interleaved_sliding_window is None
 
         layer_idx = extract_layer_index(prefix)
         layer_has_sliding_window = (
             getattr(config, "sliding_window_pattern", False)
             and (layer_idx + 1) % self.config.sliding_window_pattern != 0)
 
-        self.sliding_window = (sliding_window
+        self.sliding_window = (interleaved_sliding_window
                                if layer_has_sliding_window else None)
 
         self.attn = Attention(self.num_heads,
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 4529cf27ef565..58417980e7b47 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -22,9 +22,9 @@ from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 # yapf conflicts with isort for this block
 # yapf: disable
-from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
-                                             EAGLEConfig, ExaoneConfig,
-                                             H2OVLChatConfig,
+from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
+                                             DbrxConfig, EAGLEConfig,
+                                             ExaoneConfig, H2OVLChatConfig,
                                              InternVLChatConfig, JAISConfig,
                                              MedusaConfig, MllamaConfig,
                                              MLPSpeculatorConfig, MPTConfig,
@@ -52,6 +52,7 @@ _CONFIG_REGISTRY_OVERRIDE_HF: Dict[str, Type[PretrainedConfig]] = {
 
 _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     "chatglm": ChatGLMConfig,
+    "cohere2": Cohere2Config,
     "dbrx": DbrxConfig,
     "mpt": MPTConfig,
     "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index c24433cd436b4..a41a35c88b3a1 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -1,4 +1,5 @@
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
+from vllm.transformers_utils.configs.cohere2 import Cohere2Config
 from vllm.transformers_utils.configs.dbrx import DbrxConfig
 from vllm.transformers_utils.configs.eagle import EAGLEConfig
 from vllm.transformers_utils.configs.exaone import ExaoneConfig
@@ -22,6 +23,7 @@ from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 
 __all__ = [
     "ChatGLMConfig",
+    "Cohere2Config",
     "DbrxConfig",
     "MPTConfig",
     "RWConfig",
diff --git a/vllm/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py
new file mode 100644
index 0000000000000..1509330fc2179
--- /dev/null
+++ b/vllm/transformers_utils/configs/cohere2.py
@@ -0,0 +1,192 @@
+# ruff: noqa
+
+# Adapted from
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/cohere2/configuration_cohere2.py
+from transformers import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+
+
+class Cohere2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere
+    model according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256000):
+            Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`CohereModel`]
+        hidden_size (`int`, *optional*, defaults to 8192):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 22528):
+            Dimension of the MLP representations.
+        logit_scale (`float`, *optional*, defaults to 0.0625):
+            The scaling factor for the output logits.
+        num_hidden_layers (`int`, *optional*, defaults to 40):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 64):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 5):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 255001):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Size of the sliding window attention context.
+        sliding_window_pattern (`int`, *optional*, defaults to 4):
+            Pattern for the sliding window attention.
+        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
+
+    ```python
+    >>> from transformers import Cohere2Model, Cohere2Config
+
+    >>> # Initializing a Cohere Nextmodel configuration
+    >>> configuration = Cohere2Config()
+
+    >>> # Initializing a model from the Cohere2 configuration
+    >>> model = Cohere2Model(configuration) # doctest: +SKIP
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config # doctest: +SKIP
+    ```
+    """
+
+    model_type = "cohere2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=256000,
+        hidden_size=8192,
+        intermediate_size=22528,
+        logit_scale=0.0625,
+        num_hidden_layers=40,
+        num_attention_heads=64,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=8192,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=5,
+        eos_token_id=255001,
+        tie_word_embeddings=True,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        sliding_window=4096,
+        sliding_window_pattern=4,
+        cache_implementation="hybrid",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.logit_scale = logit_scale
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.sliding_window = sliding_window
+        self.sliding_window_pattern = sliding_window_pattern
+        # Need to specify head_dim in the config so it can be used in the attention forward functions
+        self.head_dim = hidden_size // num_attention_heads
+        self.cache_implementation = cache_implementation
+
+        # Validate the correctness of rotary position embeddings parameters
+        rope_config_validation(self)
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["Cohere2Config"]

From 4fb8e329fd6f51d576bcf4b7e8907e0d83c4b5cf Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sat, 28 Dec 2024 15:51:57 -0500
Subject: [PATCH 23/48] [V1] [5/N] API Server: unify `Detokenizer` and 
 `EngineCore` input (#11545)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
---
 tests/v1/engine/test_detokenizer.py | 57 ++++++++++++++++++-----------
 vllm/v1/engine/__init__.py          | 16 +-------
 vllm/v1/engine/async_llm.py         | 14 ++++---
 vllm/v1/engine/detokenizer.py       | 21 ++++++-----
 vllm/v1/engine/llm_engine.py        | 12 +++---
 vllm/v1/engine/processor.py         | 23 ++----------
 6 files changed, 66 insertions(+), 77 deletions(-)

diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py
index 07f343666cb5e..aeae697ca32b0 100644
--- a/tests/v1/engine/test_detokenizer.py
+++ b/tests/v1/engine/test_detokenizer.py
@@ -3,9 +3,9 @@ from typing import List
 import pytest
 from transformers import AutoTokenizer
 
-from vllm.sampling_params import RequestOutputKind
-from vllm.v1.engine import EngineCoreOutput
-from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest
+from vllm.sampling_params import RequestOutputKind, SamplingParams
+from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
+from vllm.v1.engine.detokenizer import Detokenizer
 
 TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
 tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
@@ -71,16 +71,22 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
 
     # Make N requests.
     requests = [
-        DetokenizerRequest(
-            request_id=f"request-{idx}",
-            prompt=prompt,
-            prompt_token_ids=prompt_tokens,
-            skip_special_tokens=False,
-            spaces_between_special_tokens=False,
-            output_kind=request_output_kind,
-            stop=[],
-            include_stop_str_in_output=False,
-        ) for idx, (
+        EngineCoreRequest(request_id=f"request-{idx}",
+                          prompt=prompt,
+                          prompt_token_ids=prompt_tokens,
+                          arrival_time=0,
+                          mm_inputs=None,
+                          mm_hashes=None,
+                          mm_placeholders=None,
+                          eos_token_id=None,
+                          lora_request=None,
+                          sampling_params=SamplingParams(
+                              skip_special_tokens=False,
+                              spaces_between_special_tokens=False,
+                              output_kind=request_output_kind,
+                              stop=[],
+                              include_stop_str_in_output=False))
+        for idx, (
             prompt,
             prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
     ]
@@ -133,18 +139,25 @@ def test_stop_string(include_stop_str_in_output: bool):
 
     # Make N requests.
     requests = [
-        DetokenizerRequest(
+        EngineCoreRequest(
             request_id=f"request-{idx}",
             prompt=prompt,
             prompt_token_ids=prompt_tokens,
-            skip_special_tokens=False,
-            spaces_between_special_tokens=False,
-            output_kind=RequestOutputKind.DELTA,
-            stop=STOP_STRINGS,
-            include_stop_str_in_output=include_stop_str_in_output,
-        ) for idx, (
-            prompt,
-            prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
+            arrival_time=0,
+            mm_inputs=None,
+            mm_hashes=None,
+            mm_placeholders=None,
+            eos_token_id=None,
+            lora_request=None,
+            sampling_params=SamplingParams(
+                skip_special_tokens=False,
+                spaces_between_special_tokens=False,
+                output_kind=RequestOutputKind.DELTA,
+                stop=STOP_STRINGS,
+                include_stop_str_in_output=include_stop_str_in_output,
+            )) for idx, (
+                prompt,
+                prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
     ]
 
     # Add requests to the detokenizer.
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index cc0c7ea23469a..f70464fc88298 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -6,21 +6,7 @@ import msgspec
 
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
-from vllm.sampling_params import RequestOutputKind, SamplingParams
-
-
-@dataclass
-class DetokenizerRequest:
-
-    request_id: str
-    prompt: Optional[str]
-    prompt_token_ids: List[int]
-    skip_special_tokens: bool
-    spaces_between_special_tokens: bool
-    output_kind: RequestOutputKind
-
-    stop: List[str]
-    include_stop_str_in_output: bool
+from vllm.sampling_params import SamplingParams
 
 
 @dataclass
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index da3da6dad6436..213ddaa023dbc 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -158,16 +158,18 @@ class AsyncLLM(EngineClient):
             raise ValueError(f"Request id {request_id} already running.")
         self.rid_to_queue[request_id] = asyncio.Queue()
 
-        # 2) Convert input --> DetokenizerRequest / EngineCoreRequest.
-        detokenizer_req, engine_core_req = self.processor.process_inputs(
-            request_id, prompt, params, arrival_time, lora_request,
-            trace_headers, prompt_adapter_request, priority)
+        # 2) Convert Input --> Request.
+        request = self.processor.process_inputs(request_id, prompt, params,
+                                                arrival_time, lora_request,
+                                                trace_headers,
+                                                prompt_adapter_request,
+                                                priority)
 
         # 3) Add the request to Detokenizer (this process).
-        self.detokenizer.add_request(detokenizer_req)
+        self.detokenizer.add_request(request)
 
         # 4) Add the EngineCoreRequest to EngineCore (separate process).
-        await self.engine_core.add_request_async(engine_core_req)
+        await self.engine_core.add_request_async(request)
 
         if self.log_requests:
             logger.info("Added request %s.", request_id)
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index 02f34e2b54dd5..65be9e58e03c8 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -8,7 +8,7 @@ from vllm.sampling_params import RequestOutputKind
 from vllm.transformers_utils.detokenizer_utils import (
     AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from vllm.v1.engine import DetokenizerRequest, EngineCoreOutput
+from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
 
 logger = init_logger(__name__)
 
@@ -55,19 +55,19 @@ class IncrementalDetokenizer:
     def from_new_request(
         cls,
         tokenizer: AnyTokenizer,
-        request: DetokenizerRequest,
+        request: EngineCoreRequest,
     ) -> "IncrementalDetokenizer":
 
         tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens(
             tokenizer=tokenizer,
             prompt_ids=request.prompt_token_ids,
-            skip_special_tokens=request.skip_special_tokens,
+            skip_special_tokens=request.sampling_params.skip_special_tokens,
         )
 
-        stops = request.stop
+        stops = request.sampling_params.stop
         # Number of chars to hold back when stop strings are to be excluded
         # from streamed output.
-        if stops and not request.include_stop_str_in_output:
+        if stops and not request.sampling_params.include_stop_str_in_output:
             stop_buffer_length = max(len(s) for s in stops) - 1
         else:
             stop_buffer_length = 0
@@ -79,13 +79,14 @@ class IncrementalDetokenizer:
             # NOTE(Nick): could we take ownership of it though?
             token_ids=request.prompt_token_ids.copy(),
             stop=stops,
-            include_stop_str_in_output=request.include_stop_str_in_output,
+            include_stop_str_in_output=request.sampling_params.
+            include_stop_str_in_output,
             prefix_offset=prefix_offset,
             read_offset=read_offset,
-            skip_special_tokens=request.skip_special_tokens,
-            spaces_between_special_tokens=request.
+            skip_special_tokens=request.sampling_params.skip_special_tokens,
+            spaces_between_special_tokens=request.sampling_params.
             spaces_between_special_tokens,
-            output_kind=request.output_kind,
+            output_kind=request.sampling_params.output_kind,
             request_id=request.request_id,
             prompt=request.prompt,
             prompt_token_ids=request.prompt_token_ids,
@@ -227,7 +228,7 @@ class Detokenizer:
 
     def add_request(
         self,
-        request: DetokenizerRequest,
+        request: EngineCoreRequest,
     ):
         """Add new request to the Detokenizer."""
 
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index fc323184abc8f..a19109559eabf 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -152,15 +152,17 @@ class LLMEngine:
     ) -> None:
 
         # 1) Process raw inputs into the request.
-        detokenizer_req, engine_core_req = self.processor.process_inputs(
-            request_id, prompt, params, arrival_time, lora_request,
-            trace_headers, prompt_adapter_request, priority)
+        request = self.processor.process_inputs(request_id, prompt, params,
+                                                arrival_time, lora_request,
+                                                trace_headers,
+                                                prompt_adapter_request,
+                                                priority)
 
         # 2) Add the request to Detokenizer.
-        self.detokenizer.add_request(detokenizer_req)
+        self.detokenizer.add_request(request)
 
         # 3) Add the request to EngineCore.
-        self.engine_core.add_request(engine_core_req)
+        self.engine_core.add_request(request)
 
     def step(self) -> List[RequestOutput]:
 
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 6ee8732bc902c..5b5a5a61cea7d 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -1,5 +1,5 @@
 import time
-from typing import Mapping, Optional, Tuple, Union
+from typing import Mapping, Optional, Union
 
 from vllm.config import CacheConfig, LoRAConfig, ModelConfig
 from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
@@ -13,7 +13,7 @@ from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
-from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest
+from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient
 
 
@@ -62,7 +62,7 @@ class Processor:
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
-    ) -> Tuple[DetokenizerRequest, EngineCoreRequest]:
+    ) -> EngineCoreRequest:
 
         # TODO(woosuk): Support pooling models.
         # TODO(woosuk): Check max_logprobs
@@ -123,20 +123,7 @@ class Processor:
                 decoder_inputs.multi_modal_data, mm_hashes,
                 decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs)
 
-        # Make Request for Detokenizer.
-        detokenizer_request = DetokenizerRequest(
-            request_id,
-            decoder_inputs.prompt,
-            decoder_inputs.prompt_token_ids,
-            sampling_params.skip_special_tokens,
-            sampling_params.spaces_between_special_tokens,
-            sampling_params.output_kind,
-            sampling_params.stop,
-            sampling_params.include_stop_str_in_output,
-        )
-
-        # Make Request for EngineCore.
-        engine_core_request = EngineCoreRequest(
+        return EngineCoreRequest(
             request_id,
             decoder_inputs.prompt,
             decoder_inputs.prompt_token_ids,
@@ -149,8 +136,6 @@ class Processor:
             lora_request,
         )
 
-        return detokenizer_request, engine_core_request
-
     def _validate_model_inputs(self, inputs: ProcessorInputs):
         if is_encoder_decoder_inputs(inputs):
             # For encoder-decoder multimodal models, the max_prompt_len

From 32b4c63f02b2ab28a49a040b1d170a903a5cd9dc Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sun, 29 Dec 2024 15:56:22 +0800
Subject: [PATCH 24/48] [Doc] Convert list tables to MyST (#11594)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/getting_started/debugging.md      |    2 +-
 .../getting_started/gaudi-installation.md     |   37 +-
 .../getting_started/tpu-installation.md       |   51 +-
 docs/source/models/supported_models.md        | 1192 ++++++++---------
 .../source/quantization/supported_hardware.md |  227 ++--
 docs/source/serving/deploying_with_helm.md    |  407 +++---
 6 files changed, 951 insertions(+), 965 deletions(-)

diff --git a/docs/source/getting_started/debugging.md b/docs/source/getting_started/debugging.md
index 3b0029f2e88ce..19eb699572a08 100644
--- a/docs/source/getting_started/debugging.md
+++ b/docs/source/getting_started/debugging.md
@@ -197,4 +197,4 @@ if __name__ == '__main__':
 ## Known Issues
 
 - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759).
-- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable ``NCCL_CUMEM_ENABLE=0`` to disable NCCL's ``cuMem`` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656) .
+- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable `NCCL_CUMEM_ENABLE=0` to disable NCCL's `cuMem` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656) .
diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/gaudi-installation.md
index 447bf98084a5d..1f2ee62860dec 100644
--- a/docs/source/getting_started/gaudi-installation.md
+++ b/docs/source/getting_started/gaudi-installation.md
@@ -141,26 +141,25 @@ Gaudi2 devices. Configurations that are not listed may or may not work.
 
 Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag.
 
-```{eval-rst}
-.. list-table:: vLLM execution modes
-   :widths: 25 25 50
-   :header-rows: 1
+```{list-table} vLLM execution modes
+:widths: 25 25 50
+:header-rows: 1
 
-   * - ``PT_HPU_LAZY_MODE``
-     - ``enforce_eager``
-     - execution mode
-   * - 0
-     - 0
-     - torch.compile
-   * - 0
-     - 1
-     - PyTorch eager mode
-   * - 1
-     - 0
-     - HPU Graphs
-   * - 1
-     - 1
-     - PyTorch lazy mode
+* - `PT_HPU_LAZY_MODE`
+  - `enforce_eager`
+  - execution mode
+* - 0
+  - 0
+  - torch.compile
+* - 0
+  - 1
+  - PyTorch eager mode
+* - 1
+  - 0
+  - HPU Graphs
+* - 1
+  - 1
+  - PyTorch lazy mode
 ```
 
 ```{warning}
diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/tpu-installation.md
index 17eded4a51fec..4d3ac541c90ce 100644
--- a/docs/source/getting_started/tpu-installation.md
+++ b/docs/source/getting_started/tpu-installation.md
@@ -68,33 +68,32 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
 --service-account SERVICE_ACCOUNT
 ```
 
-```{eval-rst}
-.. list-table:: Parameter descriptions
-    :header-rows: 1
+```{list-table} Parameter descriptions
+:header-rows: 1
 
-    * - Parameter name
-      - Description
-    * - QUEUED_RESOURCE_ID
-      - The user-assigned ID of the queued resource request.
-    * - TPU_NAME
-      - The user-assigned name of the TPU which is created when the queued
-        resource request is allocated.
-    * - PROJECT_ID
-      - Your Google Cloud project
-    * - ZONE
-      - The GCP zone where you want to create your Cloud TPU. The value you use
-        depends on the version of TPUs you are using. For more information, see
-        `TPU regions and zones <https://cloud.google.com/tpu/docs/regions-zones>`_
-    * - ACCELERATOR_TYPE
-      - The TPU version you want to use. Specify the TPU version, for example
-        `v5litepod-4` specifies a v5e TPU with 4 cores. For more information,
-        see `TPU versions <https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions>`_.
-    * - RUNTIME_VERSION
-      - The TPU VM runtime version to use. For more information see `TPU VM images <https://cloud.google.com/tpu/docs/runtimes>`_.
-    * - SERVICE_ACCOUNT
-      - The email address for your service account. You can find it in the IAM
-        Cloud Console under *Service Accounts*. For example:
-        `tpu-service-account@<your_project_ID>.iam.gserviceaccount.com`
+* - Parameter name
+  - Description
+* - QUEUED_RESOURCE_ID
+  - The user-assigned ID of the queued resource request.
+* - TPU_NAME
+  - The user-assigned name of the TPU which is created when the queued
+    resource request is allocated.
+* - PROJECT_ID
+  - Your Google Cloud project
+* - ZONE
+  - The GCP zone where you want to create your Cloud TPU. The value you use
+    depends on the version of TPUs you are using. For more information, see
+    `TPU regions and zones <https://cloud.google.com/tpu/docs/regions-zones>`_
+* - ACCELERATOR_TYPE
+  - The TPU version you want to use. Specify the TPU version, for example
+    `v5litepod-4` specifies a v5e TPU with 4 cores. For more information,
+    see `TPU versions <https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions>`_.
+* - RUNTIME_VERSION
+  - The TPU VM runtime version to use. For more information see `TPU VM images <https://cloud.google.com/tpu/docs/runtimes>`_.
+* - SERVICE_ACCOUNT
+  - The email address for your service account. You can find it in the IAM
+    Cloud Console under *Service Accounts*. For example:
+    `tpu-service-account@<your_project_ID>.iam.gserviceaccount.com`
 ```
 
 Connect to your TPU using SSH:
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index e11befbb8dd30..518505abeb2a9 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -72,291 +72,290 @@ See [this page](#generative-models) for more information on how to use generativ
 
 #### Text Generation (`--task generate`)
 
-```{eval-rst}
-.. list-table::
-  :widths: 25 25 50 5 5
-  :header-rows: 1
+```{list-table}
+:widths: 25 25 50 5 5
+:header-rows: 1
 
-  * - Architecture
-    - Models
-    - Example HF Models
-    - :ref:`LoRA <lora-adapter>`
-    - :ref:`PP <distributed-serving>`
-  * - :code:`AquilaForCausalLM`
-    - Aquila, Aquila2
-    - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`ArcticForCausalLM`
-    - Arctic
-    - :code:`Snowflake/snowflake-arctic-base`, :code:`Snowflake/snowflake-arctic-instruct`, etc.
-    -
-    - ✅︎
-  * - :code:`BaiChuanForCausalLM`
-    - Baichuan2, Baichuan
-    - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`BloomForCausalLM`
-    - BLOOM, BLOOMZ, BLOOMChat
-    - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc.
-    -
-    - ✅︎
-  * - :code:`BartForConditionalGeneration`
-    - BART
-    - :code:`facebook/bart-base`, :code:`facebook/bart-large-cnn`, etc.
-    -
-    -
-  * - :code:`ChatGLMModel`
-    - ChatGLM
-    - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`CohereForCausalLM`, :code:`Cohere2ForCausalLM`
-    - Command-R
-    - :code:`CohereForAI/c4ai-command-r-v01`, :code:`CohereForAI/c4ai-command-r7b-12-2024`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`DbrxForCausalLM`
-    - DBRX
-    - :code:`databricks/dbrx-base`, :code:`databricks/dbrx-instruct`, etc.
-    -
-    - ✅︎
-  * - :code:`DeciLMForCausalLM`
-    - DeciLM
-    - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc.
-    -
-    - ✅︎
-  * - :code:`DeepseekForCausalLM`
-    - DeepSeek
-    - :code:`deepseek-ai/deepseek-llm-67b-base`, :code:`deepseek-ai/deepseek-llm-7b-chat` etc.
-    -
-    - ✅︎
-  * - :code:`DeepseekV2ForCausalLM`
-    - DeepSeek-V2
-    - :code:`deepseek-ai/DeepSeek-V2`, :code:`deepseek-ai/DeepSeek-V2-Chat` etc.
-    -
-    - ✅︎
-  * - :code:`DeepseekV3ForCausalLM`
-    - DeepSeek-V3
-    - :code:`deepseek-ai/DeepSeek-V3-Base`, :code:`deepseek-ai/DeepSeek-V3` etc.
-    -
-    - ✅︎
-  * - :code:`ExaoneForCausalLM`
-    - EXAONE-3
-    - :code:`LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`FalconForCausalLM`
-    - Falcon
-    - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
-    -
-    - ✅︎
-  * - :code:`FalconMambaForCausalLM`
-    - FalconMamba
-    - :code:`tiiuae/falcon-mamba-7b`, :code:`tiiuae/falcon-mamba-7b-instruct`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`GemmaForCausalLM`
-    - Gemma
-    - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`Gemma2ForCausalLM`
-    - Gemma2
-    - :code:`google/gemma-2-9b`, :code:`google/gemma-2-27b`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`GlmForCausalLM`
-    - GLM-4
-    - :code:`THUDM/glm-4-9b-chat-hf`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`GPT2LMHeadModel`
-    - GPT-2
-    - :code:`gpt2`, :code:`gpt2-xl`, etc.
-    -
-    - ✅︎
-  * - :code:`GPTBigCodeForCausalLM`
-    - StarCoder, SantaCoder, WizardCoder
-    - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`GPTJForCausalLM`
-    - GPT-J
-    - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc.
-    -
-    - ✅︎
-  * - :code:`GPTNeoXForCausalLM`
-    - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM
-    - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc.
-    -
-    - ✅︎
-  * - :code:`GraniteForCausalLM`
-    - Granite 3.0, Granite 3.1, PowerLM
-    - :code:`ibm-granite/granite-3.0-2b-base`, :code:`ibm-granite/granite-3.1-8b-instruct`, :code:`ibm/PowerLM-3b`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`GraniteMoeForCausalLM`
-    - Granite 3.0 MoE, PowerMoE
-    - :code:`ibm-granite/granite-3.0-1b-a400m-base`, :code:`ibm-granite/granite-3.0-3b-a800m-instruct`, :code:`ibm/PowerMoE-3b`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`GritLM`
-    - GritLM
-    - :code:`parasail-ai/GritLM-7B-vllm`.
-    - ✅︎
-    - ✅︎
-  * - :code:`InternLMForCausalLM`
-    - InternLM
-    - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`InternLM2ForCausalLM`
-    - InternLM2
-    - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`JAISLMHeadModel`
-    - Jais
-    - :code:`inceptionai/jais-13b`, :code:`inceptionai/jais-13b-chat`, :code:`inceptionai/jais-30b-v3`, :code:`inceptionai/jais-30b-chat-v3`, etc.
-    -
-    - ✅︎
-  * - :code:`JambaForCausalLM`
-    - Jamba
-    - :code:`ai21labs/AI21-Jamba-1.5-Large`, :code:`ai21labs/AI21-Jamba-1.5-Mini`, :code:`ai21labs/Jamba-v0.1`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`LlamaForCausalLM`
-    - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi
-    - :code:`meta-llama/Meta-Llama-3.1-405B-Instruct`, :code:`meta-llama/Meta-Llama-3.1-70B`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-70b-hf`, :code:`01-ai/Yi-34B`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`MambaForCausalLM`
-    - Mamba
-    - :code:`state-spaces/mamba-130m-hf`, :code:`state-spaces/mamba-790m-hf`, :code:`state-spaces/mamba-2.8b-hf`, etc.
-    -
-    - ✅︎
-  * - :code:`MiniCPMForCausalLM`
-    - MiniCPM
-    - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, :code:`openbmb/MiniCPM-S-1B-sft`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`MiniCPM3ForCausalLM`
-    - MiniCPM3
-    - :code:`openbmb/MiniCPM3-4B`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`MistralForCausalLM`
-    - Mistral, Mistral-Instruct
-    - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`MixtralForCausalLM`
-    - Mixtral-8x7B, Mixtral-8x7B-Instruct
-    - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, :code:`mistral-community/Mixtral-8x22B-v0.1`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`MPTForCausalLM`
-    - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter
-    - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc.
-    -
-    - ✅︎
-  * - :code:`NemotronForCausalLM`
-    - Nemotron-3, Nemotron-4, Minitron
-    - :code:`nvidia/Minitron-8B-Base`, :code:`mgoin/Nemotron-4-340B-Base-hf-FP8`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`OLMoForCausalLM`
-    - OLMo
-    - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc.
-    -
-    - ✅︎
-  * - :code:`OLMo2ForCausalLM`
-    - OLMo2
-    - :code:`allenai/OLMo2-7B-1124`, etc.
-    -
-    - ✅︎
-  * - :code:`OLMoEForCausalLM`
-    - OLMoE
-    - :code:`allenai/OLMoE-1B-7B-0924`, :code:`allenai/OLMoE-1B-7B-0924-Instruct`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`OPTForCausalLM`
-    - OPT, OPT-IML
-    - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc.
-    -
-    - ✅︎
-  * - :code:`OrionForCausalLM`
-    - Orion
-    - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc.
-    -
-    - ✅︎
-  * - :code:`PhiForCausalLM`
-    - Phi
-    - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`Phi3ForCausalLM`
-    - Phi-3
-    - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, :code:`microsoft/Phi-3-medium-128k-instruct`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`Phi3SmallForCausalLM`
-    - Phi-3-Small
-    - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc.
-    -
-    - ✅︎
-  * - :code:`PhiMoEForCausalLM`
-    - Phi-3.5-MoE
-    - :code:`microsoft/Phi-3.5-MoE-instruct`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`PersimmonForCausalLM`
-    - Persimmon
-    - :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc.
-    -
-    - ✅︎
-  * - :code:`QWenLMHeadModel`
-    - Qwen
-    - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`Qwen2ForCausalLM`
-    - Qwen2
-    - :code:`Qwen/QwQ-32B-Preview`, :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`Qwen2MoeForCausalLM`
-    - Qwen2MoE
-    - :code:`Qwen/Qwen1.5-MoE-A2.7B`, :code:`Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.
-    -
-    - ✅︎
-  * - :code:`StableLmForCausalLM`
-    - StableLM
-    - :code:`stabilityai/stablelm-3b-4e1t`, :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc.
-    -
-    - ✅︎
-  * - :code:`Starcoder2ForCausalLM`
-    - Starcoder2
-    - :code:`bigcode/starcoder2-3b`, :code:`bigcode/starcoder2-7b`, :code:`bigcode/starcoder2-15b`, etc.
-    -
-    - ✅︎
-  * - :code:`SolarForCausalLM`
-    - Solar Pro
-    - :code:`upstage/solar-pro-preview-instruct`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`TeleChat2ForCausalLM`
-    - TeleChat2
-    - :code:`TeleAI/TeleChat2-3B`, :code:`TeleAI/TeleChat2-7B`, :code:`TeleAI/TeleChat2-35B`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`XverseForCausalLM`
-    - XVERSE
-    - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc.
-    - ✅︎
-    - ✅︎
+* - Architecture
+  - Models
+  - Example HF Models
+  - [LoRA](#lora-adapter)
+  - [PP](#distributed-serving)
+* - `AquilaForCausalLM`
+  - Aquila, Aquila2
+  - `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.
+  - ✅︎
+  - ✅︎
+* - `ArcticForCausalLM`
+  - Arctic
+  - `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc.
+  -
+  - ✅︎
+* - `BaiChuanForCausalLM`
+  - Baichuan2, Baichuan
+  - `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.
+  - ✅︎
+  - ✅︎
+* - `BloomForCausalLM`
+  - BLOOM, BLOOMZ, BLOOMChat
+  - `bigscience/bloom`, `bigscience/bloomz`, etc.
+  -
+  - ✅︎
+* - `BartForConditionalGeneration`
+  - BART
+  - `facebook/bart-base`, `facebook/bart-large-cnn`, etc.
+  -
+  -
+* - `ChatGLMModel`
+  - ChatGLM
+  - `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.
+  - ✅︎
+  - ✅︎
+* - `CohereForCausalLM`, `Cohere2ForCausalLM`
+  - Command-R
+  - `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc.
+  - ✅︎
+  - ✅︎
+* - `DbrxForCausalLM`
+  - DBRX
+  - `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc.
+  -
+  - ✅︎
+* - `DeciLMForCausalLM`
+  - DeciLM
+  - `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.
+  -
+  - ✅︎
+* - `DeepseekForCausalLM`
+  - DeepSeek
+  - `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc.
+  -
+  - ✅︎
+* - `DeepseekV2ForCausalLM`
+  - DeepSeek-V2
+  - `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc.
+  -
+  - ✅︎
+* - `DeepseekV3ForCausalLM`
+  - DeepSeek-V3
+  - `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc.
+  -
+  - ✅︎
+* - `ExaoneForCausalLM`
+  - EXAONE-3
+  - `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc.
+  - ✅︎
+  - ✅︎
+* - `FalconForCausalLM`
+  - Falcon
+  - `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.
+  -
+  - ✅︎
+* - `FalconMambaForCausalLM`
+  - FalconMamba
+  - `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc.
+  - ✅︎
+  - ✅︎
+* - `GemmaForCausalLM`
+  - Gemma
+  - `google/gemma-2b`, `google/gemma-7b`, etc.
+  - ✅︎
+  - ✅︎
+* - `Gemma2ForCausalLM`
+  - Gemma2
+  - `google/gemma-2-9b`, `google/gemma-2-27b`, etc.
+  - ✅︎
+  - ✅︎
+* - `GlmForCausalLM`
+  - GLM-4
+  - `THUDM/glm-4-9b-chat-hf`, etc.
+  - ✅︎
+  - ✅︎
+* - `GPT2LMHeadModel`
+  - GPT-2
+  - `gpt2`, `gpt2-xl`, etc.
+  -
+  - ✅︎
+* - `GPTBigCodeForCausalLM`
+  - StarCoder, SantaCoder, WizardCoder
+  - `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc.
+  - ✅︎
+  - ✅︎
+* - `GPTJForCausalLM`
+  - GPT-J
+  - `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.
+  -
+  - ✅︎
+* - `GPTNeoXForCausalLM`
+  - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM
+  - `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.
+  -
+  - ✅︎
+* - `GraniteForCausalLM`
+  - Granite 3.0, Granite 3.1, PowerLM
+  - `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc.
+  - ✅︎
+  - ✅︎
+* - `GraniteMoeForCausalLM`
+  - Granite 3.0 MoE, PowerMoE
+  - `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc.
+  - ✅︎
+  - ✅︎
+* - `GritLM`
+  - GritLM
+  - `parasail-ai/GritLM-7B-vllm`.
+  - ✅︎
+  - ✅︎
+* - `InternLMForCausalLM`
+  - InternLM
+  - `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.
+  - ✅︎
+  - ✅︎
+* - `InternLM2ForCausalLM`
+  - InternLM2
+  - `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.
+  - ✅︎
+  - ✅︎
+* - `JAISLMHeadModel`
+  - Jais
+  - `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc.
+  -
+  - ✅︎
+* - `JambaForCausalLM`
+  - Jamba
+  - `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc.
+  - ✅︎
+  - ✅︎
+* - `LlamaForCausalLM`
+  - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi
+  - `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc.
+  - ✅︎
+  - ✅︎
+* - `MambaForCausalLM`
+  - Mamba
+  - `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc.
+  -
+  - ✅︎
+* - `MiniCPMForCausalLM`
+  - MiniCPM
+  - `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc.
+  - ✅︎
+  - ✅︎
+* - `MiniCPM3ForCausalLM`
+  - MiniCPM3
+  - `openbmb/MiniCPM3-4B`, etc.
+  - ✅︎
+  - ✅︎
+* - `MistralForCausalLM`
+  - Mistral, Mistral-Instruct
+  - `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.
+  - ✅︎
+  - ✅︎
+* - `MixtralForCausalLM`
+  - Mixtral-8x7B, Mixtral-8x7B-Instruct
+  - `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.
+  - ✅︎
+  - ✅︎
+* - `MPTForCausalLM`
+  - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter
+  - `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc.
+  -
+  - ✅︎
+* - `NemotronForCausalLM`
+  - Nemotron-3, Nemotron-4, Minitron
+  - `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc.
+  - ✅︎
+  - ✅︎
+* - `OLMoForCausalLM`
+  - OLMo
+  - `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.
+  -
+  - ✅︎
+* - `OLMo2ForCausalLM`
+  - OLMo2
+  - `allenai/OLMo2-7B-1124`, etc.
+  -
+  - ✅︎
+* - `OLMoEForCausalLM`
+  - OLMoE
+  - `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc.
+  - ✅︎
+  - ✅︎
+* - `OPTForCausalLM`
+  - OPT, OPT-IML
+  - `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.
+  -
+  - ✅︎
+* - `OrionForCausalLM`
+  - Orion
+  - `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.
+  -
+  - ✅︎
+* - `PhiForCausalLM`
+  - Phi
+  - `microsoft/phi-1_5`, `microsoft/phi-2`, etc.
+  - ✅︎
+  - ✅︎
+* - `Phi3ForCausalLM`
+  - Phi-3
+  - `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.
+  - ✅︎
+  - ✅︎
+* - `Phi3SmallForCausalLM`
+  - Phi-3-Small
+  - `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc.
+  -
+  - ✅︎
+* - `PhiMoEForCausalLM`
+  - Phi-3.5-MoE
+  - `microsoft/Phi-3.5-MoE-instruct`, etc.
+  - ✅︎
+  - ✅︎
+* - `PersimmonForCausalLM`
+  - Persimmon
+  - `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc.
+  -
+  - ✅︎
+* - `QWenLMHeadModel`
+  - Qwen
+  - `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.
+  - ✅︎
+  - ✅︎
+* - `Qwen2ForCausalLM`
+  - Qwen2
+  - `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc.
+  - ✅︎
+  - ✅︎
+* - `Qwen2MoeForCausalLM`
+  - Qwen2MoE
+  - `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.
+  -
+  - ✅︎
+* - `StableLmForCausalLM`
+  - StableLM
+  - `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.
+  -
+  - ✅︎
+* - `Starcoder2ForCausalLM`
+  - Starcoder2
+  - `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.
+  -
+  - ✅︎
+* - `SolarForCausalLM`
+  - Solar Pro
+  - `upstage/solar-pro-preview-instruct`, etc.
+  - ✅︎
+  - ✅︎
+* - `TeleChat2ForCausalLM`
+  - TeleChat2
+  - `TeleAI/TeleChat2-3B`, `TeleAI/TeleChat2-7B`, `TeleAI/TeleChat2-35B`, etc.
+  - ✅︎
+  - ✅︎
+* - `XverseForCausalLM`
+  - XVERSE
+  - `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
+  - ✅︎
+  - ✅︎
 ```
 
 ```{note}
@@ -374,51 +373,50 @@ you should explicitly specify the task type to ensure that the model is used in
 
 #### Text Embedding (`--task embed`)
 
-```{eval-rst}
-.. list-table::
-  :widths: 25 25 50 5 5
-  :header-rows: 1
+```{list-table}
+:widths: 25 25 50 5 5
+:header-rows: 1
 
-  * - Architecture
-    - Models
-    - Example HF Models
-    - :ref:`LoRA <lora-adapter>`
-    - :ref:`PP <distributed-serving>`
-  * - :code:`BertModel`
-    - BERT-based
-    - :code:`BAAI/bge-base-en-v1.5`, etc.
-    -
-    -
-  * - :code:`Gemma2Model`
-    - Gemma2-based
-    - :code:`BAAI/bge-multilingual-gemma2`, etc.
-    -
-    - ✅︎
-  * - :code:`GritLM`
-    - GritLM
-    - :code:`parasail-ai/GritLM-7B-vllm`.
-    - ✅︎
-    - ✅︎
-  * - :code:`LlamaModel`, :code:`LlamaForCausalLM`, :code:`MistralModel`, etc.
-    - Llama-based
-    - :code:`intfloat/e5-mistral-7b-instruct`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM`
-    - Qwen2-based
-    - :code:`ssmits/Qwen2-7B-Instruct-embed-base` (see note), :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`RobertaModel`, :code:`RobertaForMaskedLM`
-    - RoBERTa-based
-    - :code:`sentence-transformers/all-roberta-large-v1`, :code:`sentence-transformers/all-roberta-large-v1`, etc.
-    -
-    -
-  * - :code:`XLMRobertaModel`
-    - XLM-RoBERTa-based
-    - :code:`intfloat/multilingual-e5-large`, etc.
-    -
-    -
+* - Architecture
+  - Models
+  - Example HF Models
+  - [LoRA](#lora-adapter)
+  - [PP](#distributed-serving)
+* - `BertModel`
+  - BERT-based
+  - `BAAI/bge-base-en-v1.5`, etc.
+  -
+  -
+* - `Gemma2Model`
+  - Gemma2-based
+  - `BAAI/bge-multilingual-gemma2`, etc.
+  -
+  - ✅︎
+* - `GritLM`
+  - GritLM
+  - `parasail-ai/GritLM-7B-vllm`.
+  - ✅︎
+  - ✅︎
+* - `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc.
+  - Llama-based
+  - `intfloat/e5-mistral-7b-instruct`, etc.
+  - ✅︎
+  - ✅︎
+* - `Qwen2Model`, `Qwen2ForCausalLM`
+  - Qwen2-based
+  - `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc.
+  - ✅︎
+  - ✅︎
+* - `RobertaModel`, `RobertaForMaskedLM`
+  - RoBERTa-based
+  - `sentence-transformers/all-roberta-large-v1`, `sentence-transformers/all-roberta-large-v1`, etc.
+  -
+  -
+* - `XLMRobertaModel`
+  - XLM-RoBERTa-based
+  - `intfloat/multilingual-e5-large`, etc.
+  -
+  -
 ```
 
 ```{note}
@@ -440,31 +438,30 @@ of the whole prompt are extracted from the normalized hidden state corresponding
 
 #### Reward Modeling (`--task reward`)
 
-```{eval-rst}
-.. list-table::
-  :widths: 25 25 50 5 5
-  :header-rows: 1
+```{list-table}
+:widths: 25 25 50 5 5
+:header-rows: 1
 
-  * - Architecture
-    - Models
-    - Example HF Models
-    - :ref:`LoRA <lora-adapter>`
-    - :ref:`PP <distributed-serving>`
-  * - :code:`InternLM2ForRewardModel`
-    - InternLM2-based
-    - :code:`internlm/internlm2-1_8b-reward`, :code:`internlm/internlm2-7b-reward`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`LlamaForCausalLM`
-    - Llama-based
-    - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`Qwen2ForRewardModel`
-    - Qwen2-based
-    - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc.
-    - ✅︎
-    - ✅︎
+* - Architecture
+  - Models
+  - Example HF Models
+  - [LoRA](#lora-adapter)
+  - [PP](#distributed-serving)
+* - `InternLM2ForRewardModel`
+  - InternLM2-based
+  - `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc.
+  - ✅︎
+  - ✅︎
+* - `LlamaForCausalLM`
+  - Llama-based
+  - `peiyi9979/math-shepherd-mistral-7b-prm`, etc.
+  - ✅︎
+  - ✅︎
+* - `Qwen2ForRewardModel`
+  - Qwen2-based
+  - `Qwen/Qwen2.5-Math-RM-72B`, etc.
+  - ✅︎
+  - ✅︎
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
@@ -477,26 +474,25 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1
 
 #### Classification (`--task classify`)
 
-```{eval-rst}
-.. list-table::
-  :widths: 25 25 50 5 5
-  :header-rows: 1
+```{list-table}
+:widths: 25 25 50 5 5
+:header-rows: 1
 
-  * - Architecture
-    - Models
-    - Example HF Models
-    - :ref:`LoRA <lora-adapter>`
-    - :ref:`PP <distributed-serving>`
-  * - :code:`JambaForSequenceClassification`
-    - Jamba
-    - :code:`ai21labs/Jamba-tiny-reward-dev`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`Qwen2ForSequenceClassification`
-    - Qwen2-based
-    - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc.
-    - ✅︎
-    - ✅︎
+* - Architecture
+  - Models
+  - Example HF Models
+  - [LoRA](#lora-adapter)
+  - [PP](#distributed-serving)
+* - `JambaForSequenceClassification`
+  - Jamba
+  - `ai21labs/Jamba-tiny-reward-dev`, etc.
+  - ✅︎
+  - ✅︎
+* - `Qwen2ForSequenceClassification`
+  - Qwen2-based
+  - `jason9693/Qwen2.5-1.5B-apeach`, etc.
+  - ✅︎
+  - ✅︎
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
@@ -504,31 +500,30 @@ If your model is not in the above list, we will try to automatically convert the
 
 #### Sentence Pair Scoring (`--task score`)
 
-```{eval-rst}
-.. list-table::
-  :widths: 25 25 50 5 5
-  :header-rows: 1
+```{list-table}
+:widths: 25 25 50 5 5
+:header-rows: 1
 
-  * - Architecture
-    - Models
-    - Example HF Models
-    - :ref:`LoRA <lora-adapter>`
-    - :ref:`PP <distributed-serving>`
-  * - :code:`BertForSequenceClassification`
-    - BERT-based
-    - :code:`cross-encoder/ms-marco-MiniLM-L-6-v2`, etc.
-    -
-    -
-  * - :code:`RobertaForSequenceClassification`
-    - RoBERTa-based
-    - :code:`cross-encoder/quora-roberta-base`, etc.
-    -
-    -
-  * - :code:`XLMRobertaForSequenceClassification`
-    - XLM-RoBERTa-based
-    - :code:`BAAI/bge-reranker-v2-m3`, etc.
-    -
-    -
+* - Architecture
+  - Models
+  - Example HF Models
+  - [LoRA](#lora-adapter)
+  - [PP](#distributed-serving)
+* - `BertForSequenceClassification`
+  - BERT-based
+  - `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc.
+  -
+  -
+* - `RobertaForSequenceClassification`
+  - RoBERTa-based
+  - `cross-encoder/quora-roberta-base`, etc.
+  -
+  -
+* - `XLMRobertaForSequenceClassification`
+  - XLM-RoBERTa-based
+  - `BAAI/bge-reranker-v2-m3`, etc.
+  -
+  -
 ```
 
 (supported-mm-models)=
@@ -558,186 +553,182 @@ See [this page](#generative-models) for more information on how to use generativ
 
 #### Text Generation (`--task generate`)
 
-```{eval-rst}
-.. list-table::
-  :widths: 25 25 15 20 5 5 5
-  :header-rows: 1
+```{list-table}
+:widths: 25 25 15 20 5 5 5
+:header-rows: 1
 
-  * - Architecture
-    - Models
-    - Inputs
-    - Example HF Models
-    - :ref:`LoRA <lora-adapter>`
-    - :ref:`PP <distributed-serving>`
-    - V1
-  * - :code:`AriaForConditionalGeneration`
-    - Aria
-    - T + I
-    - :code:`rhymes-ai/Aria`
-    -
-    - ✅︎
-    -
-  * - :code:`Blip2ForConditionalGeneration`
-    - BLIP-2
-    - T + I\ :sup:`E`
-    - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc.
-    -
-    - ✅︎
-    -
-  * - :code:`ChameleonForConditionalGeneration`
-    - Chameleon
-    - T + I
-    - :code:`facebook/chameleon-7b` etc.
-    -
-    - ✅︎
-    -
-  * - :code:`FuyuForCausalLM`
-    - Fuyu
-    - T + I
-    - :code:`adept/fuyu-8b` etc.
-    -
-    - ✅︎
-    -
-  * - :code:`ChatGLMModel`
-    - GLM-4V
-    - T + I
-    - :code:`THUDM/glm-4v-9b` etc.
-    - ✅︎
-    - ✅︎
-    -
-  * - :code:`H2OVLChatModel`
-    - H2OVL
-    - T + I\ :sup:`E+`
-    - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc.
-    -
-    - ✅︎
-    -
-  * - :code:`Idefics3ForConditionalGeneration`
-    - Idefics3
-    - T + I
-    - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc.
-    - ✅︎
-    -
-    -
-  * - :code:`InternVLChatModel`
-    - InternVL 2.5, Mono-InternVL, InternVL 2.0
-    - T + I\ :sup:`E+`
-    - :code:`OpenGVLab/InternVL2_5-4B`, :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, etc.
-    -
-    - ✅︎
-    - ✅︎
-  * - :code:`LlavaForConditionalGeneration`
-    - LLaVA-1.5
-    - T + I\ :sup:`E+`
-    - :code:`llava-hf/llava-1.5-7b-hf`, :code:`TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.
-    -
-    - ✅︎
-    - ✅︎
-  * - :code:`LlavaNextForConditionalGeneration`
-    - LLaVA-NeXT
-    - T + I\ :sup:`E+`
-    - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
-    -
-    - ✅︎
-    -
-  * - :code:`LlavaNextVideoForConditionalGeneration`
-    - LLaVA-NeXT-Video
-    - T + V
-    - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
-    -
-    - ✅︎
-    -
-  * - :code:`LlavaOnevisionForConditionalGeneration`
-    - LLaVA-Onevision
-    - T + I\ :sup:`+` + V\ :sup:`+`
-    - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
-    -
-    - ✅︎
-    -
-  * - :code:`MiniCPMV`
-    - MiniCPM-V
-    - T + I\ :sup:`E+`
-    - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
-    - ✅︎
-    - ✅︎
-    -
-  * - :code:`MllamaForConditionalGeneration`
-    - Llama 3.2
-    - T + I\ :sup:`+`
-    - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc.
-    -
-    -
-    -
-  * - :code:`MolmoForCausalLM`
-    - Molmo
-    - T + I
-    - :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc.
-    -
-    - ✅︎
-    - ✅︎
-  * - :code:`NVLM_D_Model`
-    - NVLM-D 1.0
-    - T + I\ :sup:`E+`
-    - :code:`nvidia/NVLM-D-72B`, etc.
-    -
-    - ✅︎
-    - ✅︎
-  * - :code:`PaliGemmaForConditionalGeneration`
-    - PaliGemma, PaliGemma 2
-    - T + I\ :sup:`E`
-    - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, :code:`google/paligemma2-3b-ft-docci-448`, etc.
-    -
-    - ✅︎
-    -
-  * - :code:`Phi3VForCausalLM`
-    - Phi-3-Vision, Phi-3.5-Vision
-    - T + I\ :sup:`E+`
-    - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc.
-    -
-    - ✅︎
-    - ✅︎
-  * - :code:`PixtralForConditionalGeneration`
-    - Pixtral
-    - T + I\ :sup:`+`
-    - :code:`mistralai/Pixtral-12B-2409`, :code:`mistral-community/pixtral-12b` etc.
-    -
-    - ✅︎
-    - ✅︎
-  * - :code:`QWenLMHeadModel`
-    - Qwen-VL
-    - T + I\ :sup:`E+`
-    - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
-    - ✅︎
-    - ✅︎
-    -
-  * - :code:`Qwen2AudioForConditionalGeneration`
-    - Qwen2-Audio
-    - T + A\ :sup:`+`
-    - :code:`Qwen/Qwen2-Audio-7B-Instruct`
-    -
-    - ✅︎
-    -
-  * - :code:`Qwen2VLForConditionalGeneration`
-    - Qwen2-VL
-    - T + I\ :sup:`E+` + V\ :sup:`E+`
-    - :code:`Qwen/QVQ-72B-Preview`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
-    - ✅︎
-    - ✅︎
-    -
-  * - :code:`UltravoxModel`
-    - Ultravox
-    - T + A\ :sup:`E+`
-    - :code:`fixie-ai/ultravox-v0_3`
-    -
-    - ✅︎
-    -
+* - Architecture
+  - Models
+  - Inputs
+  - Example HF Models
+  - [LoRA](#lora-adapter)
+  - [PP](#distributed-serving)
+  - [V1](gh-issue:8779)
+* - `AriaForConditionalGeneration`
+  - Aria
+  - T + I
+  - `rhymes-ai/Aria`
+  -
+  - ✅︎
+  -
+* - `Blip2ForConditionalGeneration`
+  - BLIP-2
+  - T + I<sup>E</sup>
+  - `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc.
+  -
+  - ✅︎
+  -
+* - `ChameleonForConditionalGeneration`
+  - Chameleon
+  - T + I
+  - `facebook/chameleon-7b` etc.
+  -
+  - ✅︎
+  -
+* - `FuyuForCausalLM`
+  - Fuyu
+  - T + I
+  - `adept/fuyu-8b` etc.
+  -
+  - ✅︎
+  -
+* - `ChatGLMModel`
+  - GLM-4V
+  - T + I
+  - `THUDM/glm-4v-9b` etc.
+  - ✅︎
+  - ✅︎
+  -
+* - `H2OVLChatModel`
+  - H2OVL
+  - T + I<sup>E+</sup>
+  - `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.
+  -
+  - ✅︎
+  -
+* - `Idefics3ForConditionalGeneration`
+  - Idefics3
+  - T + I
+  - `HuggingFaceM4/Idefics3-8B-Llama3` etc.
+  - ✅︎
+  -
+  -
+* - `InternVLChatModel`
+  - InternVL 2.5, Mono-InternVL, InternVL 2.0
+  - T + I<sup>E+</sup>
+  - `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.
+  -
+  - ✅︎
+  - ✅︎
+* - `LlavaForConditionalGeneration`
+  - LLaVA-1.5
+  - T + I<sup>E+</sup>
+  - `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.
+  -
+  - ✅︎
+  - ✅︎
+* - `LlavaNextForConditionalGeneration`
+  - LLaVA-NeXT
+  - T + I<sup>E+</sup>
+  - `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
+  -
+  - ✅︎
+  -
+* - `LlavaNextVideoForConditionalGeneration`
+  - LLaVA-NeXT-Video
+  - T + V
+  - `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
+  -
+  - ✅︎
+  -
+* - `LlavaOnevisionForConditionalGeneration`
+  - LLaVA-Onevision
+  - T + I<sup>+</sup> + V<sup>+</sup>
+  - `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
+  -
+  - ✅︎
+  -
+* - `MiniCPMV`
+  - MiniCPM-V
+  - T + I<sup>E+</sup>
+  - `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc.
+  - ✅︎
+  - ✅︎
+  -
+* - `MllamaForConditionalGeneration`
+  - Llama 3.2
+  - T + I<sup>+</sup>
+  - `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc.
+  -
+  -
+  -
+* - `MolmoForCausalLM`
+  - Molmo
+  - T + I
+  - `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc.
+  -
+  - ✅︎
+  - ✅︎
+* - `NVLM_D_Model`
+  - NVLM-D 1.0
+  - T + I<sup>E+</sup>
+  - `nvidia/NVLM-D-72B`, etc.
+  -
+  - ✅︎
+  - ✅︎
+* - `PaliGemmaForConditionalGeneration`
+  - PaliGemma, PaliGemma 2
+  - T + I<sup>E</sup>
+  - `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc.
+  -
+  - ✅︎
+  -
+* - `Phi3VForCausalLM`
+  - Phi-3-Vision, Phi-3.5-Vision
+  - T + I<sup>E+</sup>
+  - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct` etc.
+  -
+  - ✅︎
+  - ✅︎
+* - `PixtralForConditionalGeneration`
+  - Pixtral
+  - T + I<sup>+</sup>
+  - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` etc.
+  -
+  - ✅︎
+  - ✅︎
+* - `QWenLMHeadModel`
+  - Qwen-VL
+  - T + I<sup>E+</sup>
+  - `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc.
+  - ✅︎
+  - ✅︎
+  -
+* - `Qwen2AudioForConditionalGeneration`
+  - Qwen2-Audio
+  - T + A<sup>+</sup>
+  - `Qwen/Qwen2-Audio-7B-Instruct`
+  -
+  - ✅︎
+  -
+* - `Qwen2VLForConditionalGeneration`
+  - Qwen2-VL
+  - T + I<sup>E+</sup> + V<sup>E+</sup>
+  - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc.
+  - ✅︎
+  - ✅︎
+  -
+* - `UltravoxModel`
+  - Ultravox
+  - T + A<sup>E+</sup>
+  - `fixie-ai/ultravox-v0_3`
+  -
+  - ✅︎
+  -
 ```
 
-```{eval-rst}
-:sup:`E` Pre-computed embeddings can be inputted for this modality.
-
-:sup:`+` Multiple items can be inputted per text prompt for this modality.
-```
+<sup>E</sup> Pre-computed embeddings can be inputted for this modality.  
+<sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
 ````{important}
 To enable multiple multi-modal items per text prompt, you have to set {code}`limit_mm_per_prompt` (offline inference)
@@ -787,38 +778,37 @@ To get the best results, you should use pooling models that are specifically tra
 
 The following table lists those that are tested in vLLM.
 
-```{eval-rst}
-.. list-table::
-  :widths: 25 25 15 25 5 5
-  :header-rows: 1
+```{list-table}
+:widths: 25 25 15 25 5 5
+:header-rows: 1
 
-  * - Architecture
-    - Models
-    - Inputs
-    - Example HF Models
-    - :ref:`LoRA <lora-adapter>`
-    - :ref:`PP <distributed-serving>`
-  * - :code:`LlavaNextForConditionalGeneration`
-    - LLaVA-NeXT-based
-    - T / I
-    - :code:`royokong/e5-v`
-    -
-    - ✅︎
-  * - :code:`Phi3VForCausalLM`
-    - Phi-3-Vision-based
-    - T + I
-    - :code:`TIGER-Lab/VLM2Vec-Full`
-    - 🚧
-    - ✅︎
-  * - :code:`Qwen2VLForConditionalGeneration`
-    - Qwen2-VL-based
-    - T + I
-    - :code:`MrLight/dse-qwen2-2b-mrl-v1`
-    -
-    - ✅︎
+* - Architecture
+  - Models
+  - Inputs
+  - Example HF Models
+  - [LoRA](#lora-adapter)
+  - [PP](#distributed-serving)
+* - `LlavaNextForConditionalGeneration`
+  - LLaVA-NeXT-based
+  - T / I
+  - `royokong/e5-v`
+  -
+  - ✅︎
+* - `Phi3VForCausalLM`
+  - Phi-3-Vision-based
+  - T + I
+  - `TIGER-Lab/VLM2Vec-Full`
+  - 🚧
+  - ✅︎
+* - `Qwen2VLForConditionalGeneration`
+  - Qwen2-VL-based
+  - T + I
+  - `MrLight/dse-qwen2-2b-mrl-v1`
+  -
+  - ✅︎
 ```
 
-______________________________________________________________________
+_________________
 
 # Model Support Policy
 
diff --git a/docs/source/quantization/supported_hardware.md b/docs/source/quantization/supported_hardware.md
index 843ee21627d78..7330c2f8aa194 100644
--- a/docs/source/quantization/supported_hardware.md
+++ b/docs/source/quantization/supported_hardware.md
@@ -4,121 +4,120 @@
 
 The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
 
-```{eval-rst}
-.. list-table::
-   :header-rows: 1
-   :widths: 20 8 8 8 8 8 8 8 8 8 8
+```{list-table}
+:header-rows: 1
+:widths: 20 8 8 8 8 8 8 8 8 8 8
 
-   * - Implementation
-     - Volta
-     - Turing
-     - Ampere
-     - Ada
-     - Hopper
-     - AMD GPU
-     - Intel GPU
-     - x86 CPU
-     - AWS Inferentia
-     - Google TPU
-   * - AWQ
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-   * - GPTQ
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-   * - Marlin (GPTQ/AWQ/FP8)
-     - ✗
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-   * - INT8 (W8A8)
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✅︎
-     - ✗
-     - ✗
-   * - FP8 (W8A8)
-     - ✗
-     - ✗
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-   * - AQLM
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-   * - bitsandbytes
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-   * - DeepSpeedFP
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-   * - GGUF
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
+* - Implementation
+  - Volta
+  - Turing
+  - Ampere
+  - Ada
+  - Hopper
+  - AMD GPU
+  - Intel GPU
+  - x86 CPU
+  - AWS Inferentia
+  - Google TPU
+* - AWQ
+  - ✗
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+* - GPTQ
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+* - Marlin (GPTQ/AWQ/FP8)
+  - ✗
+  - ✗
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+* - INT8 (W8A8)
+  - ✗
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+  - ✅︎
+  - ✗
+  - ✗
+* - FP8 (W8A8)
+  - ✗
+  - ✗
+  - ✗
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+* - AQLM
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+* - bitsandbytes
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+* - DeepSpeedFP
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+* - GGUF
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+  - ✗
 ```
 
 ## Notes:
diff --git a/docs/source/serving/deploying_with_helm.md b/docs/source/serving/deploying_with_helm.md
index 3b26575827011..7286a0a88968f 100644
--- a/docs/source/serving/deploying_with_helm.md
+++ b/docs/source/serving/deploying_with_helm.md
@@ -43,209 +43,208 @@ chart **including persistent volumes** and deletes the release.
 
 ## Values
 
-```{eval-rst}
-.. list-table:: Values
-   :widths: 25 25 25 25
-   :header-rows: 1
+```{list-table}
+:widths: 25 25 25 25
+:header-rows: 1
 
-   * - Key
-     - Type
-     - Default
-     - Description
-   * - autoscaling
-     - object
-     - {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}
-     - Autoscaling configuration
-   * - autoscaling.enabled
-     - bool
-     - false
-     - Enable autoscaling
-   * - autoscaling.maxReplicas
-     - int
-     - 100
-     - Maximum replicas
-   * - autoscaling.minReplicas
-     - int
-     - 1
-     - Minimum replicas
-   * - autoscaling.targetCPUUtilizationPercentage
-     - int
-     - 80
-     - Target CPU utilization for autoscaling
-   * - configs
-     - object
-     - {}
-     - Configmap
-   * - containerPort
-     - int
-     - 8000
-     - Container port
-   * - customObjects
-     - list
-     - []
-     - Custom Objects configuration
-   * - deploymentStrategy
-     - object
-     - {}
-     - Deployment strategy configuration
-   * - externalConfigs
-     - list
-     - []
-     - External configuration
-   * - extraContainers
-     - list
-     - []
-     - Additional containers configuration
-   * - extraInit
-     - object
-     - {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}
-     - Additional configuration for the init container
-   * - extraInit.pvcStorage
-     - string
-     - "50Gi"
-     - Storage size of the s3
-   * - extraInit.s3modelpath
-     - string
-     - "relative_s3_model_path/opt-125m"
-     - Path of the model on the s3 which hosts model weights and config files
-   * - extraInit.awsEc2MetadataDisabled
-     - boolean
-     - true
-     - Disables the use of the Amazon EC2 instance metadata service
-   * - extraPorts
-     - list
-     - []
-     - Additional ports configuration
-   * - gpuModels
-     - list
-     - ["TYPE_GPU_USED"]
-     - Type of gpu used
-   * - image
-     - object
-     - {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"}
-     - Image configuration
-   * - image.command
-     - list
-     - ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]
-     - Container launch command
-   * - image.repository
-     - string
-     - "vllm/vllm-openai"
-     - Image repository
-   * - image.tag
-     - string
-     - "latest"
-     - Image tag
-   * - livenessProbe
-     - object
-     - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}
-     - Liveness probe configuration
-   * - livenessProbe.failureThreshold
-     - int
-     - 3
-     - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
-   * - livenessProbe.httpGet
-     - object
-     - {"path":"/health","port":8000}
-     - Configuration of the Kubelet http request on the server
-   * - livenessProbe.httpGet.path
-     - string
-     - "/health"
-     - Path to access on the HTTP server
-   * - livenessProbe.httpGet.port
-     - int
-     - 8000
-     - Name or number of the port to access on the container, on which the server is listening
-   * - livenessProbe.initialDelaySeconds
-     - int
-     - 15
-     - Number of seconds after the container has started before liveness probe is initiated
-   * - livenessProbe.periodSeconds
-     - int
-     - 10
-     - How often (in seconds) to perform the liveness probe
-   * - maxUnavailablePodDisruptionBudget
-     - string
-     - ""
-     - Disruption Budget Configuration
-   * - readinessProbe
-     - object
-     - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}
-     - Readiness probe configuration
-   * - readinessProbe.failureThreshold
-     - int
-     - 3
-     - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
-   * - readinessProbe.httpGet
-     - object
-     - {"path":"/health","port":8000}
-     - Configuration of the Kubelet http request on the server
-   * - readinessProbe.httpGet.path
-     - string
-     - "/health"
-     - Path to access on the HTTP server
-   * - readinessProbe.httpGet.port
-     - int
-     - 8000
-     - Name or number of the port to access on the container, on which the server is listening
-   * - readinessProbe.initialDelaySeconds
-     - int
-     - 5
-     - Number of seconds after the container has started before readiness probe is initiated
-   * - readinessProbe.periodSeconds
-     - int
-     - 5
-     - How often (in seconds) to perform the readiness probe
-   * - replicaCount
-     - int
-     - 1
-     - Number of replicas
-   * - resources
-     - object
-     - {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}
-     - Resource configuration
-   * - resources.limits."nvidia.com/gpu"
-     - int
-     - 1
-     - Number of gpus used
-   * - resources.limits.cpu
-     - int
-     - 4
-     - Number of CPUs
-   * - resources.limits.memory
-     - string
-     - "16Gi"
-     - CPU memory configuration
-   * - resources.requests."nvidia.com/gpu"
-     - int
-     - 1
-     - Number of gpus used
-   * - resources.requests.cpu
-     - int
-     - 4
-     - Number of CPUs
-   * - resources.requests.memory
-     - string
-     - "16Gi"
-     - CPU memory configuration
-   * - secrets
-     - object
-     - {}
-     - Secrets configuration
-   * - serviceName
-     - string
-     -
-     - Service name
-   * - servicePort
-     - int
-     - 80
-     - Service port
-   * - labels.environment
-     - string
-     - test
-     - Environment name
-   * - labels.release
-     - string
-     - test
-     - Release name
+* - Key
+  - Type
+  - Default
+  - Description
+* - autoscaling
+  - object
+  - {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}
+  - Autoscaling configuration
+* - autoscaling.enabled
+  - bool
+  - false
+  - Enable autoscaling
+* - autoscaling.maxReplicas
+  - int
+  - 100
+  - Maximum replicas
+* - autoscaling.minReplicas
+  - int
+  - 1
+  - Minimum replicas
+* - autoscaling.targetCPUUtilizationPercentage
+  - int
+  - 80
+  - Target CPU utilization for autoscaling
+* - configs
+  - object
+  - {}
+  - Configmap
+* - containerPort
+  - int
+  - 8000
+  - Container port
+* - customObjects
+  - list
+  - []
+  - Custom Objects configuration
+* - deploymentStrategy
+  - object
+  - {}
+  - Deployment strategy configuration
+* - externalConfigs
+  - list
+  - []
+  - External configuration
+* - extraContainers
+  - list
+  - []
+  - Additional containers configuration
+* - extraInit
+  - object
+  - {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}
+  - Additional configuration for the init container
+* - extraInit.pvcStorage
+  - string
+  - "50Gi"
+  - Storage size of the s3
+* - extraInit.s3modelpath
+  - string
+  - "relative_s3_model_path/opt-125m"
+  - Path of the model on the s3 which hosts model weights and config files
+* - extraInit.awsEc2MetadataDisabled
+  - boolean
+  - true
+  - Disables the use of the Amazon EC2 instance metadata service
+* - extraPorts
+  - list
+  - []
+  - Additional ports configuration
+* - gpuModels
+  - list
+  - ["TYPE_GPU_USED"]
+  - Type of gpu used
+* - image
+  - object
+  - {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"}
+  - Image configuration
+* - image.command
+  - list
+  - ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]
+  - Container launch command
+* - image.repository
+  - string
+  - "vllm/vllm-openai"
+  - Image repository
+* - image.tag
+  - string
+  - "latest"
+  - Image tag
+* - livenessProbe
+  - object
+  - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}
+  - Liveness probe configuration
+* - livenessProbe.failureThreshold
+  - int
+  - 3
+  - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
+* - livenessProbe.httpGet
+  - object
+  - {"path":"/health","port":8000}
+  - Configuration of the Kubelet http request on the server
+* - livenessProbe.httpGet.path
+  - string
+  - "/health"
+  - Path to access on the HTTP server
+* - livenessProbe.httpGet.port
+  - int
+  - 8000
+  - Name or number of the port to access on the container, on which the server is listening
+* - livenessProbe.initialDelaySeconds
+  - int
+  - 15
+  - Number of seconds after the container has started before liveness probe is initiated
+* - livenessProbe.periodSeconds
+  - int
+  - 10
+  - How often (in seconds) to perform the liveness probe
+* - maxUnavailablePodDisruptionBudget
+  - string
+  - ""
+  - Disruption Budget Configuration
+* - readinessProbe
+  - object
+  - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}
+  - Readiness probe configuration
+* - readinessProbe.failureThreshold
+  - int
+  - 3
+  - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
+* - readinessProbe.httpGet
+  - object
+  - {"path":"/health","port":8000}
+  - Configuration of the Kubelet http request on the server
+* - readinessProbe.httpGet.path
+  - string
+  - "/health"
+  - Path to access on the HTTP server
+* - readinessProbe.httpGet.port
+  - int
+  - 8000
+  - Name or number of the port to access on the container, on which the server is listening
+* - readinessProbe.initialDelaySeconds
+  - int
+  - 5
+  - Number of seconds after the container has started before readiness probe is initiated
+* - readinessProbe.periodSeconds
+  - int
+  - 5
+  - How often (in seconds) to perform the readiness probe
+* - replicaCount
+  - int
+  - 1
+  - Number of replicas
+* - resources
+  - object
+  - {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}
+  - Resource configuration
+* - resources.limits."nvidia.com/gpu"
+  - int
+  - 1
+  - Number of gpus used
+* - resources.limits.cpu
+  - int
+  - 4
+  - Number of CPUs
+* - resources.limits.memory
+  - string
+  - "16Gi"
+  - CPU memory configuration
+* - resources.requests."nvidia.com/gpu"
+  - int
+  - 1
+  - Number of gpus used
+* - resources.requests.cpu
+  - int
+  - 4
+  - Number of CPUs
+* - resources.requests.memory
+  - string
+  - "16Gi"
+  - CPU memory configuration
+* - secrets
+  - object
+  - {}
+  - Secrets configuration
+* - serviceName
+  - string
+  -
+  - Service name
+* - servicePort
+  - int
+  - 80
+  - Service port
+* - labels.environment
+  - string
+  - test
+  - Environment name
+* - labels.release
+  - string
+  - test
+  - Release name
 ```

From dba4d9dec606da028fbb28240e99cabd5a761e6a Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 29 Dec 2024 17:03:49 +0800
Subject: [PATCH 25/48] [v1][bugfix] fix cudagraph with inplace buffer
 assignment (#11596)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/compilation/wrapper.py                    | 10 +++++++++-
 vllm/model_executor/layers/rotary_embedding.py | 11 +----------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index c10241b483169..e3260a10c02ae 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -28,11 +28,12 @@ class TorchCompileWrapperWithCustomDispatcher:
                  compiled_callable: Optional[Callable] = None,
                  compilation_level: int = 0):
 
+        vllm_config = get_current_vllm_config()
+        self.vllm_config = vllm_config
         if compiled_callable is None:
             # default compilation settings
             # compiling the forward method
 
-            vllm_config = get_current_vllm_config()
             backend = vllm_config.compilation_config.init_backend(vllm_config)
 
             compiled_callable = torch.compile(
@@ -82,6 +83,13 @@ class TorchCompileWrapperWithCustomDispatcher:
 
         self.compiled_codes.append(new_code)
 
+        if self.vllm_config.compilation_config.use_cudagraph and \
+            "update" in new_code.co_names:
+            import depyf
+            src = depyf.decompile(new_code)
+            msg = "Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n" + src  # noqa
+            raise RuntimeError(msg)
+
     @contextmanager
     def dispatch_to_code(self, index: int):
         """Context manager to dispatch to the compiled code.
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 117fe086e5e87..6695d44dfa32b 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -541,19 +541,12 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         short_cache = self._compute_cos_sin_cache(
             original_max_position_embeddings, short_factor, short_mscale)
         short_cache = short_cache.to(dtype)
-        self.register_buffer("short_cos_sin_cache",
-                             short_cache,
-                             persistent=False)
 
         long_cache = self._compute_cos_sin_cache(max_position_embeddings,
                                                  long_factor, long_mscale)
         long_cache = long_cache.to(dtype)
-        self.register_buffer("long_cos_sin_cache",
-                             long_cache,
-                             persistent=False)
 
-        long_short_cache = torch.cat(
-            [self.short_cos_sin_cache, self.long_cos_sin_cache], dim=0)
+        long_short_cache = torch.cat([short_cache, long_cache], dim=0)
         self.register_buffer("long_short_cos_sin_cache",
                              long_short_cache,
                              persistent=False)
@@ -593,8 +586,6 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
                               torch.full_like(positions, k)).long()
         idx = (torch.add(positions, long_prompt_offset)
                if long_prompt_offset is not None else positions)
-        self.long_short_cos_sin_cache: torch.Tensor = (
-            self.long_short_cos_sin_cache.to(idx.device))
         idx = torch.add(idx, offsets) if offsets is not None else idx
         cos_sin = torch.index_select(self.long_short_cos_sin_cache, 0, idx)
 

From faef77c0d69c5429182f475a57127676e6bcb230 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sun, 29 Dec 2024 10:08:09 -0600
Subject: [PATCH 26/48] [Misc] KV cache transfer connector registry (#11481)

Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
---
 vllm/config.py                                |  8 ----
 .../kv_transfer/kv_connector/factory.py       | 48 +++++++++++++++----
 2 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 6ae1d4d944447..8e556743c8528 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2559,14 +2559,6 @@ class KVTransferConfig(BaseModel):
         return KVTransferConfig.model_validate_json(cli_value)
 
     def model_post_init(self, __context: Any) -> None:
-        supported_kv_connector = ["PyNcclConnector", "MooncakeConnector"]
-        if all([
-                self.kv_connector is not None, self.kv_connector
-                not in supported_kv_connector
-        ]):
-            raise ValueError(f"Unsupported kv_connector: {self.kv_connector}. "
-                             f"Supported connectors are "
-                             f"{supported_kv_connector}.")
 
         if self.kv_role is not None and self.kv_role not in [
                 "kv_producer", "kv_consumer", "kv_both"
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index 3e2bb436d24b5..6372dab726086 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -1,4 +1,5 @@
-from typing import TYPE_CHECKING
+import importlib
+from typing import TYPE_CHECKING, Callable, Dict, Type
 
 from .base import KVConnectorBase
 
@@ -7,14 +8,41 @@ if TYPE_CHECKING:
 
 
 class KVConnectorFactory:
+    _registry: Dict[str, Callable[[], Type[KVConnectorBase]]] = {}
 
-    @staticmethod
-    def create_connector(rank: int, local_rank: int,
+    @classmethod
+    def register_connector(cls, name: str, module_path: str,
+                           class_name: str) -> None:
+        """Register a connector with a lazy-loading module and class name."""
+        if name in cls._registry:
+            raise ValueError(f"Connector '{name}' is already registered.")
+
+        def loader() -> Type[KVConnectorBase]:
+            module = importlib.import_module(module_path)
+            return getattr(module, class_name)
+
+        cls._registry[name] = loader
+
+    @classmethod
+    def create_connector(cls, rank: int, local_rank: int,
                          config: "VllmConfig") -> KVConnectorBase:
-        supported_kv_connector = ["PyNcclConnector", "MooncakeConnector"]
-        if config.kv_transfer_config.kv_connector in supported_kv_connector:
-            from .simple_connector import SimpleConnector
-            return SimpleConnector(rank, local_rank, config)
-        else:
-            raise ValueError(f"Unsupported connector type: "
-                             f"{config.kv_connector}")
+        connector_name = config.kv_transfer_config.kv_connector
+        if connector_name not in cls._registry:
+            raise ValueError(f"Unsupported connector type: {connector_name}")
+
+        connector_cls = cls._registry[connector_name]()
+        return connector_cls(rank, local_rank, config)
+
+
+# Register various connectors here.
+# The registration should not be done in each individual file, as we want to
+# only load the files corresponding to the current connector.
+KVConnectorFactory.register_connector(
+    "PyNcclConnector",
+    "vllm.distributed.kv_transfer.kv_connector.simple_connector",
+    "SimpleConnector")
+
+KVConnectorFactory.register_connector(
+    "MooncakeConnector",
+    "vllm.distributed.kv_transfer.kv_connector.simple_connector",
+    "SimpleConnector")

From 0aa38d16f56327622c1689d7510171662757deee Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Sun, 29 Dec 2024 15:16:46 -0500
Subject: [PATCH 27/48] Remove print statement in
 DeepseekScalingRotaryEmbedding (#11604)

---
 vllm/model_executor/layers/rotary_embedding.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 6695d44dfa32b..3fcd81a3c4213 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -668,7 +668,6 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         cos = (freqs.cos() * self.mscale)
         sin = (freqs.sin() * self.mscale)
         cache = torch.cat((cos, sin), dim=-1)
-        print("Cache shape", cache.shape)
         return cache
 
     def forward(

From 3682e33f9ff9d8baade6112a8e75a77da898f504 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 30 Dec 2024 12:24:12 +0800
Subject: [PATCH 28/48] [v1] fix compilation cache (#11598)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 tests/compile/piecewise/test_toy_llama.py | 15 +++++++-
 vllm/compilation/backends.py              | 22 ++++++-----
 vllm/config.py                            | 45 +++++++++++++++++++++--
 vllm/v1/worker/gpu_worker.py              |  1 +
 4 files changed, 69 insertions(+), 14 deletions(-)

diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index 07c10a3a18c55..d4ede4d2320a7 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -7,7 +7,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are
 initialized randomly with a fixed seed.
 """
 from dataclasses import dataclass
-from typing import Optional, Tuple
+from typing import Any, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -54,6 +54,16 @@ class LlamaConfig:
     tractable_init: bool = False
     random_seed: int = 0
 
+    def compute_hash(self) -> str:
+        factors: List[Any] = []
+        for k, v in self.__dict__.items():
+            if k == "random_seed":
+                continue
+            factors.append((k, v))
+        factors.sort()
+        import hashlib
+        return hashlib.md5(str(factors).encode()).hexdigest()
+
     def __post_init__(self):
         assert self.mlp_size >= self.hidden_size
 
@@ -263,7 +273,8 @@ def run_model(llama_config,
         compilation_config = CompilationConfig(
             level=CompilationLevel.NO_COMPILATION, )
 
-    vllm_config = VllmConfig(compilation_config=compilation_config)
+    vllm_config = VllmConfig(compilation_config=compilation_config,
+                             additional_config=llama_config)
     with set_current_vllm_config(vllm_config):
         model = LlamaModel(config=llama_config,
                            vllm_config=vllm_config,
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 4f960b441f21d..a8dd628b9cd6f 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -619,8 +619,10 @@ class PiecewiseBackend:
         # the entries for different shapes that we need to either
         # compile or capture cudagraph
         self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
-        self.to_be_compiled_sizes: Set[int] = self.compile_sizes.union(
-            self.capture_sizes)
+
+        # to_be_compiled_sizes tracks the remaining sizes to compile,
+        # and updates during the compilation process, so we need to copy it
+        self.to_be_compiled_sizes: Set[int] = self.compile_sizes.copy()
         for shape in self.compile_sizes.union(self.capture_sizes):
             self.concrete_size_entries[shape] = ConcreteSizeEntry(
                 runtime_shape=shape,
@@ -628,12 +630,17 @@ class PiecewiseBackend:
                 use_cudagraph=shape in self.capture_sizes,
             )
 
+    def check_for_ending_compilation(self):
+        if self.is_last_graph and not self.to_be_compiled_sizes:
+            # no specific sizes to compile
+            # save the hash of the inductor graph for the next run
+            self.compilation_config.inductor_hash_cache.save_to_file()
+            end_monitoring_torch_compile(self.vllm_config)
+
     def __call__(self, *args) -> Any:
         if not self.first_run_finished:
             self.first_run_finished = True
-            # no specific sizes to compile
-            if self.is_last_graph and not self.to_be_compiled_sizes:
-                end_monitoring_torch_compile(self.vllm_config)
+            self.check_for_ending_compilation()
             return self.compiled_graph_for_general_shape(*args)
 
         runtime_shape = args[self.sym_shape_indices[0]]
@@ -662,10 +669,7 @@ class PiecewiseBackend:
 
             # finished compilations for all required shapes
             if self.is_last_graph and not self.to_be_compiled_sizes:
-
-                # save the hash of the inductor graph for the next run
-                self.compilation_config.inductor_hash_cache.save_to_file()
-                end_monitoring_torch_compile(self.vllm_config)
+                self.check_for_ending_compilation()
 
         if not entry.use_cudagraph:
             return entry.runnable(*args)
diff --git a/vllm/config.py b/vllm/config.py
index 8e556743c8528..765a46e6aeee3 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -9,8 +9,8 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field, replace
 from pathlib import Path
 from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Counter, Dict,
-                    Final, List, Literal, Mapping, Optional, Set, Tuple, Type,
-                    Union)
+                    Final, List, Literal, Mapping, Optional, Protocol, Set,
+                    Tuple, Type, Union)
 
 import torch
 from pydantic import BaseModel, Field, PrivateAttr
@@ -75,6 +75,12 @@ HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig],
                                              PretrainedConfig]]
 
 
+class SupportsHash(Protocol):
+
+    def compute_hash(self) -> str:
+        ...
+
+
 class ModelConfig:
     """Configuration for the model.
 
@@ -2969,6 +2975,10 @@ class VllmConfig:
                                                   init=True)  # type: ignore
     kv_transfer_config: KVTransferConfig = field(default=None,
                                                  init=True)  # type: ignore
+    # some opaque config, only used to provide additional information
+    # for the hash computation, mainly used for testing and debugging.
+    additional_config: SupportsHash = field(default=None,
+                                            init=True)  # type: ignore
     instance_id: str = ""
 
     def compute_hash(self) -> str:
@@ -3000,33 +3010,62 @@ class VllmConfig:
         vllm_factors.append(__version__)
         if self.model_config:
             vllm_factors.append(self.model_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.cache_config:
             vllm_factors.append(self.cache_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.parallel_config:
             vllm_factors.append(self.parallel_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.scheduler_config:
             vllm_factors.append(self.scheduler_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.device_config:
             vllm_factors.append(self.device_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.load_config:
             vllm_factors.append(self.load_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.lora_config:
             vllm_factors.append(self.lora_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.speculative_config:
             vllm_factors.append(self.speculative_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.decoding_config:
             vllm_factors.append(self.decoding_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.observability_config:
             vllm_factors.append(self.observability_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.prompt_adapter_config:
             vllm_factors.append(self.prompt_adapter_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.quant_config:
             pass  # should be captured by model_config.quantization
         if self.compilation_config:
             vllm_factors.append(self.compilation_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.kv_transfer_config:
             vllm_factors.append(self.kv_transfer_config.compute_hash())
-
+        else:
+            vllm_factors.append("None")
+        if self.additional_config:
+            vllm_factors.append(self.additional_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         factors.append(vllm_factors)
 
         hash_str = hashlib.md5(str(factors).encode()).hexdigest()[:10]
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 0000b09bfaa36..af438f7d5820c 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -48,6 +48,7 @@ class Worker:
         self.prompt_adapter_config = vllm_config.prompt_adapter_config
         self.observability_config = vllm_config.observability_config
 
+        self.parallel_config.rank = rank
         self.local_rank = local_rank
         self.rank = rank
         self.distributed_init_method = distributed_init_method

From 628ec6c17b8121517e8f303b64567573036cdb38 Mon Sep 17 00:00:00 2001
From: Liangfu Chen <liangfc@amazon.com>
Date: Sun, 29 Dec 2024 21:46:14 -0800
Subject: [PATCH 29/48] [Docker] bump up neuron sdk v2.21 (#11593)

Signed-off-by: Liangfu Chen <liangfc@amazon.com>
---
 Dockerfile.neuron              | 6 +++---
 requirements-neuron.txt        | 4 ++--
 vllm/_custom_ops.py            | 3 +--
 vllm/triton_utils/importing.py | 1 -
 4 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index 77162bc82de62..269139fe90f0b 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -1,6 +1,6 @@
 # default base image
 # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04"
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"
 
 FROM $BASE_IMAGE
 
@@ -22,9 +22,9 @@ WORKDIR ${APP_MOUNT}/vllm
 
 RUN python3 -m pip install --upgrade pip
 RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
-RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
+RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
 RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
-RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 
 COPY . .
 ARG GIT_REPO_CHECK=0
diff --git a/requirements-neuron.txt b/requirements-neuron.txt
index 148fdbe0d6310..5e08d101fcd61 100644
--- a/requirements-neuron.txt
+++ b/requirements-neuron.txt
@@ -2,6 +2,6 @@
 -r requirements-common.txt
 
 # Dependencies for Neuron devices
-transformers-neuronx >= 0.12.0
-torch-neuronx >= 2.1.2
+transformers-neuronx >= 0.13.0
+torch-neuronx >= 2.5.0
 neuronx-cc
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index aeacf5dda5761..eb2f69df42624 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -23,8 +23,7 @@ with contextlib.suppress(ImportError):
     import vllm._moe_C  # noqa: F401
     supports_moe_ops = True
 
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING or current_platform.is_neuron():
+if TYPE_CHECKING:
 
     def register_fake(fn):
         return lambda name: fn
diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py
index 36315abcdfcda..0c96e0632f646 100644
--- a/vllm/triton_utils/importing.py
+++ b/vllm/triton_utils/importing.py
@@ -8,7 +8,6 @@ logger = init_logger(__name__)
 HAS_TRITON = (
     find_spec("triton") is not None
     and not current_platform.is_xpu()  # Not compatible
-    and not current_platform.is_neuron()  # neuron has too old torch
 )
 
 if not HAS_TRITON:

From 970d6d0776076f17604077ba4d484cdadd604ceb Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Mon, 30 Dec 2024 04:22:13 -0500
Subject: [PATCH 30/48] [Build][Kernel] Update CUTLASS to v3.6.0 (#11607)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 CMakeLists.txt                                 |  4 ++--
 .../vllm_cutlass_library_extension.py          | 18 +++++++++---------
 csrc/quantization/machete/generate.py          |  8 ++++----
 .../machete/machete_collective_builder.cuh     | 10 ++++------
 csrc/quantization/machete/machete_mainloop.cuh | 11 ++++-------
 .../machete/machete_prepacked_layout.cuh       |  5 ++---
 6 files changed, 25 insertions(+), 31 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 83c8033434f3b..3206d76125545 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -223,13 +223,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     FetchContent_Declare(
         cutlass
         GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-        GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227
+        GIT_TAG v3.6.0
         GIT_PROGRESS TRUE
 
         # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
         # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
         # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
-        GIT_SHALLOW FALSE
+        GIT_SHALLOW TRUE
     )
   endif()
   FetchContent_MakeAvailable(cutlass)
diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
index a5beea1a35e49..b401736c9824b 100644
--- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
@@ -14,9 +14,9 @@ class VLLMDataType(enum.Enum):
 
 
 class MixedInputKernelScheduleType(enum.Enum):
-    TmaWarpSpecializedMixedInput = enum_auto()
-    TmaWarpSpecializedPingpongMixedInput = enum_auto()
-    TmaWarpSpecializedCooperativeMixedInput = enum_auto()
+    TmaWarpSpecialized = enum_auto()
+    TmaWarpSpecializedPingpong = enum_auto()
+    TmaWarpSpecializedCooperative = enum_auto()
 
 
 VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = {
@@ -68,11 +68,11 @@ VLLMKernelScheduleTag: Dict[Union[
     MixedInputKernelScheduleType, KernelScheduleType], str] = {
         **KernelScheduleTag,  # type: ignore
         **{
-            MixedInputKernelScheduleType.TmaWarpSpecializedMixedInput:
-            "cutlass::gemm::KernelTmaWarpSpecializedMixedInput",
-            MixedInputKernelScheduleType.TmaWarpSpecializedPingpongMixedInput:
-            "cutlass::gemm::KernelTmaWarpSpecializedPingpongMixedInput",
-            MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput:
-            "cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput",
+            MixedInputKernelScheduleType.TmaWarpSpecialized:
+            "cutlass::gemm::KernelTmaWarpSpecialized",
+            MixedInputKernelScheduleType.TmaWarpSpecializedPingpong:
+            "cutlass::gemm::KernelTmaWarpSpecializedPingpong",
+            MixedInputKernelScheduleType.TmaWarpSpecializedCooperative:
+            "cutlass::gemm::KernelTmaWarpSpecializedCooperative",
         }
     }
diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py
index ac63afe79a255..2df4d181902f8 100644
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -189,7 +189,7 @@ using Kernel_{{type_sig}} = MacheteKernelTemplate<
   {{DataTypeTag[t.b_group_zeropoint]}}, // GroupZeroT
   {{DataTypeTag[t.b_channel_scale]}}, // ChannelScaleT
   {{DataTypeTag[t.a_token_scale]}}, // TokenScaleT
-  cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput,
+  cutlass::gemm::KernelTmaWarpSpecializedCooperative,
   Sch>;
 
 {% for sch in schs %}
@@ -223,7 +223,7 @@ torch::Tensor prepack_B_dispatch(PrepackBArgs args) {
         {{DataTypeTag[t.convert]}}, // ElementConvert
         {{DataTypeTag[t.accumulator]}}, // Accumulator
         cutlass::layout::ColumnMajor,
-        cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput>
+        cutlass::gemm::KernelTmaWarpSpecializedCooperative>
     >(args.B); 
   }
   {%- endfor %}
@@ -239,7 +239,7 @@ torch::Tensor prepack_B_dispatch(PrepackBArgs args) {
 }; // namespace machete
 """
 
-TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput
+TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperative
 TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative
 
 
@@ -300,7 +300,7 @@ def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
 # mostly unique shorter sch_sig
 def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str:
     kernel_terse_names_replace = {
-        "KernelTmaWarpSpecializedCooperativeMixedInput_": "TmaMI_",
+        "KernelTmaWarpSpecializedCooperative": "TmaMI_",
         "TmaWarpSpecializedCooperative_": "TmaCoop_",
         "StreamKScheduler": "streamK",
     }
diff --git a/csrc/quantization/machete/machete_collective_builder.cuh b/csrc/quantization/machete/machete_collective_builder.cuh
index a74cf8b2dd455..ee825583dee1a 100644
--- a/csrc/quantization/machete/machete_collective_builder.cuh
+++ b/csrc/quantization/machete/machete_collective_builder.cuh
@@ -18,16 +18,14 @@ struct VLLMCollectiveBuilder<
     ElementAccumulator, TileShape_MNK, ClusterShape_MNK, StageCountType,
     KernelScheduleType,
     cute::enable_if_t<(
+        cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecialized> ||
+        cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpong> ||
         cute::is_same_v<KernelScheduleType,
-                        KernelTmaWarpSpecializedMixedInput> ||
-        cute::is_same_v<KernelScheduleType,
-                        KernelTmaWarpSpecializedPingpongMixedInput> ||
-        cute::is_same_v<KernelScheduleType,
-                        KernelTmaWarpSpecializedCooperativeMixedInput>)>> {
+                        KernelTmaWarpSpecializedCooperative>)>> {
   using CollectiveOp = machete::MacheteCollectiveMma<
       ElementPairA_, GmemLayoutA_, AlignmentA, ElementPairB_, GmemLayoutB_,
       AlignmentB, ElementAccumulator, TileShape_MNK, ClusterShape_MNK,
       StageCountType, KernelScheduleType>;
 };
 
-};  // namespace cutlass::gemm::collective
\ No newline at end of file
+};  // namespace cutlass::gemm::collective
diff --git a/csrc/quantization/machete/machete_mainloop.cuh b/csrc/quantization/machete/machete_mainloop.cuh
index 816f33a1078e5..4071b19a3564d 100644
--- a/csrc/quantization/machete/machete_mainloop.cuh
+++ b/csrc/quantization/machete/machete_mainloop.cuh
@@ -66,13 +66,11 @@ struct MacheteCollectiveMma {
   using Schedule = KernelScheduleType;
   static_assert(
       cute::is_same_v<Schedule, KernelTmaWarpSpecialized> ||
-          cute::is_same_v<Schedule, KernelTmaWarpSpecializedMixedInput> ||
+          cute::is_same_v<Schedule, KernelTmaWarpSpecialized> ||
+          cute::is_same_v<Schedule, KernelTmaWarpSpecializedPingpong> ||
           cute::is_same_v<Schedule, KernelTmaWarpSpecializedPingpong> ||
-          cute::is_same_v<Schedule,
-                          KernelTmaWarpSpecializedPingpongMixedInput> ||
           cute::is_same_v<Schedule, KernelTmaWarpSpecializedCooperative> ||
-          cute::is_same_v<Schedule,
-                          KernelTmaWarpSpecializedCooperativeMixedInput>,
+          cute::is_same_v<Schedule, KernelTmaWarpSpecializedCooperative>,
       "KernelSchedule must be one of the warp specialized policies");
 
  public:
@@ -113,8 +111,7 @@ struct MacheteCollectiveMma {
   // For coop schedules we have two warp groups cooperatively issuing wgmma
   // instructions so we use 2 atoms along the M dim (one for each warpgroup)
   using AtomLayoutMNK = cute::conditional_t<
-      cute::is_same_v<KernelScheduleType,
-                      KernelTmaWarpSpecializedCooperativeMixedInput>,
+      cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative>,
       Layout<Shape<_2, _1, _1>>, Layout<Shape<_1, _1, _1>>>;
 
   using TiledMma = decltype(cute::make_tiled_mma(
diff --git a/csrc/quantization/machete/machete_prepacked_layout.cuh b/csrc/quantization/machete/machete_prepacked_layout.cuh
index 680a858a893c1..81aaa6c4f3a28 100644
--- a/csrc/quantization/machete/machete_prepacked_layout.cuh
+++ b/csrc/quantization/machete/machete_prepacked_layout.cuh
@@ -98,8 +98,7 @@ struct PrepackedLayoutBTemplate {
   // For coop schedules we have two warp groups cooperatively issuing wgmma
   // instructions so we use 2 atoms along the M dim (one for each warpgroup)
   using AtomLayoutMNK = cute::conditional_t<
-      cute::is_same_v<KernelSchedule,
-                      KernelTmaWarpSpecializedCooperativeMixedInput>,
+      cute::is_same_v<KernelSchedule, KernelTmaWarpSpecializedCooperative>,
       Layout<Shape<_2, _1, _1>>, Layout<Shape<_1, _1, _1>>>;
 
   using TiledMma = decltype(cute::make_tiled_mma(
@@ -247,4 +246,4 @@ struct PrepackedLayoutBTemplate {
   }
 };
 
-};  // namespace machete
\ No newline at end of file
+};  // namespace machete

From 5dbf854553cb6ac97f0c633ed36ba64e0fc9bb29 Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Mon, 30 Dec 2024 18:17:04 +0800
Subject: [PATCH 31/48] [CI/Build][CPU] Fix CPU CI by lazy importing triton FP8
 kernels (#11618)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
---
 vllm/model_executor/layers/quantization/fp8.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 7f779ac8d3b3e..2fe22903a385b 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -15,8 +15,6 @@ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
-from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    apply_w8a8_block_fp8_linear)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -337,6 +335,9 @@ class Fp8LinearMethod(LinearMethodBase):
                 size_k=layer.input_size_per_partition,
                 bias=bias)
 
+        # Note: lazy import to avoid triton import error.
+        from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+            apply_w8a8_block_fp8_linear)
         if self.block_quant:
             assert self.quant_config.weight_block_size is not None
             return apply_w8a8_block_fp8_linear(

From b12e87f942eb7740c17ab546b964bc327afdda37 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 30 Dec 2024 20:24:45 +0800
Subject: [PATCH 32/48] [platforms] enable platform plugins (#11602)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/test-pipeline.yaml                 |  25 +-
 docs/source/design/plugin_system.md           |   6 +-
 tests/conftest.py                             |   2 +-
 tests/kernels/test_attention_selector.py      |  16 +-
 .../plugins/vllm_add_dummy_platform/setup.py  |  11 +
 .../vllm_add_dummy_platform/__init__.py       |   5 +
 .../vllm_add_dummy_platform/dummy_platform.py |   5 +
 tests/plugins_tests/test_platform_plugins.py  |  16 +
 vllm/config.py                                |  15 +-
 vllm/distributed/parallel_state.py            |   3 +-
 vllm/engine/arg_utils.py                      |   2 +-
 vllm/executor/ray_utils.py                    |   2 +-
 .../guided_decoding/__init__.py               |   3 +-
 vllm/model_executor/models/registry.py        |   2 +-
 vllm/model_executor/utils.py                  |   4 +-
 vllm/platforms/__init__.py                    | 308 ++++++++++++------
 vllm/plugins/__init__.py                      |  72 ++--
 vllm/spec_decode/metrics.py                   |   2 +-
 vllm/usage/usage_lib.py                       |   2 +-
 vllm/utils.py                                 |   8 +-
 vllm/worker/model_runner_base.py              |   5 +-
 vllm/worker/multi_step_model_runner.py        |   1 +
 vllm/worker/worker_base.py                    |  14 +-
 23 files changed, 354 insertions(+), 175 deletions(-)
 create mode 100644 tests/plugins/vllm_add_dummy_platform/setup.py
 create mode 100644 tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
 create mode 100644 tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
 create mode 100644 tests/plugins_tests/test_platform_plugins.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b563c96343f92..bee968b4d2e43 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -106,14 +106,12 @@ steps:
   source_file_dependencies:
   - vllm/
   commands:
-  - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
   - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
-  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
   - pytest -v -s entrypoints/test_chat_utils.py
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
@@ -333,8 +331,6 @@ steps:
   - vllm/
   - tests/models
   commands:
-    - pip install -e ./plugins/vllm_add_dummy_model
-    - pytest -v -s models/test_oot_registration.py # it needs a clean process
     - pytest -v -s models/test_registry.py
     - pytest -v -s models/test_initialization.py
 
@@ -469,11 +465,28 @@ steps:
   - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - pip install -e ./plugins/vllm_add_dummy_model
-  - pytest -v -s distributed/test_distributed_oot.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
 
+- label: Plugin Tests (2 GPUs) # 40min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  fast_check: true
+  source_file_dependencies:
+  - vllm/plugins/
+  - tests/plugins/
+  commands:
+  # begin platform plugin tests, all the code in-between runs on dummy platform
+  - pip install -e ./plugins/vllm_add_dummy_platform
+  - pytest -v -s plugins_tests/test_platform_plugins.py
+  - pip uninstall vllm_add_dummy_platform -y
+  # end platform plugin tests
+  # other tests continue here:
+  - pip install -e ./plugins/vllm_add_dummy_model
+  - pytest -v -s distributed/test_distributed_oot.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+  - pytest -v -s models/test_oot_registration.py # it needs a clean process
+
 - label: Multi-step Tests (4 GPUs) # 36min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
diff --git a/docs/source/design/plugin_system.md b/docs/source/design/plugin_system.md
index 79aff757518f2..225030885f629 100644
--- a/docs/source/design/plugin_system.md
+++ b/docs/source/design/plugin_system.md
@@ -41,9 +41,11 @@ Every plugin has three parts:
 2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name.
 3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.
 
-## What Can Plugins Do?
+## Types of supported plugins
 
-Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM.
+- **General plugins** (with group name `vllm.general_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model inside the plugin function.
+
+- **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.
 
 ## Guidelines for Writing Plugins
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 4e939221329cd..6e2f75e33654f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -31,7 +31,6 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
                          to_enc_dec_tuple_list, zip_enc_dec_prompts)
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
-from vllm.platforms import current_platform
 from vllm.sampling_params import BeamSearchParams
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
                         identity)
@@ -242,6 +241,7 @@ _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
 class HfRunner:
 
     def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
+        from vllm.platforms import current_platform
         if x is None or isinstance(x, (bool, )):
             return x
 
diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py
index d37f95d48d5b2..916cc2efa3895 100644
--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -5,7 +5,10 @@ import torch
 
 from tests.kernels.utils import override_backend_env_variable
 from vllm.attention.selector import which_attn_to_use
-from vllm.platforms import cpu, cuda, openvino, rocm
+from vllm.platforms.cpu import CpuPlatform
+from vllm.platforms.cuda import CudaPlatform
+from vllm.platforms.openvino import OpenVinoPlatform
+from vllm.platforms.rocm import RocmPlatform
 from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL
 
 
@@ -20,26 +23,23 @@ def test_env(name: str, device: str, monkeypatch):
     override_backend_env_variable(monkeypatch, name)
 
     if device == "cpu":
-        with patch("vllm.attention.selector.current_platform",
-                   cpu.CpuPlatform()):
+        with patch("vllm.attention.selector.current_platform", CpuPlatform()):
             backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
                                         False)
         assert backend.name == "TORCH_SDPA"
     elif device == "hip":
-        with patch("vllm.attention.selector.current_platform",
-                   rocm.RocmPlatform()):
+        with patch("vllm.attention.selector.current_platform", RocmPlatform()):
             backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
                                         False)
         assert backend.name == "ROCM_FLASH"
     elif device == "openvino":
         with patch("vllm.attention.selector.current_platform",
-                   openvino.OpenVinoPlatform()):
+                   OpenVinoPlatform()):
             backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
                                         False)
         assert backend.name == "OPENVINO"
     else:
-        with patch("vllm.attention.selector.current_platform",
-                   cuda.CudaPlatform()):
+        with patch("vllm.attention.selector.current_platform", CudaPlatform()):
             backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
                                         False)
         assert backend.name == name
diff --git a/tests/plugins/vllm_add_dummy_platform/setup.py b/tests/plugins/vllm_add_dummy_platform/setup.py
new file mode 100644
index 0000000000000..31639906898db
--- /dev/null
+++ b/tests/plugins/vllm_add_dummy_platform/setup.py
@@ -0,0 +1,11 @@
+from setuptools import setup
+
+setup(
+    name='vllm_add_dummy_platform',
+    version='0.1',
+    packages=['vllm_add_dummy_platform'],
+    entry_points={
+        'vllm.platform_plugins': [
+            "dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin"  # noqa
+        ]
+    })
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
new file mode 100644
index 0000000000000..594cef520a7de
--- /dev/null
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
@@ -0,0 +1,5 @@
+from typing import Optional
+
+
+def dummy_platform_plugin() -> Optional[str]:
+    return "vllm_add_dummy_platform.dummy_platform.DummyPlatform"
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
new file mode 100644
index 0000000000000..fde93142f1103
--- /dev/null
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
@@ -0,0 +1,5 @@
+from vllm.platforms.cuda import CudaPlatform
+
+
+class DummyPlatform(CudaPlatform):
+    device_name = "DummyDevice"
diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py
new file mode 100644
index 0000000000000..0d27cf9f152e0
--- /dev/null
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -0,0 +1,16 @@
+def test_platform_plugins():
+    # simulate workload by running an example
+    import runpy
+    current_file = __file__
+    import os
+    example_file = os.path.join(
+        os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
+        "examples", "offline_inference.py")
+    runpy.run_path(example_file)
+
+    # check if the plugin is loaded correctly
+    from vllm.platforms import _init_trace, current_platform
+    assert current_platform.device_name == "DummyDevice", (
+        f"Expected DummyDevice, got {current_platform.device_name}, "
+        "possibly because current_platform is imported before the plugin"
+        f" is loaded. The first import:\n{_init_trace}")
diff --git a/vllm/config.py b/vllm/config.py
index 765a46e6aeee3..e72c53b6130d0 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -22,7 +22,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
                                                      get_quantization_config)
 from vllm.model_executor.models import ModelRegistry
-from vllm.platforms import current_platform, interface
+from vllm.platforms import CpuArchEnum
 from vllm.tracing import is_otel_available, otel_import_error_traceback
 from vllm.transformers_utils.config import (
     ConfigFormat, get_config, get_hf_image_processor_config,
@@ -349,6 +349,7 @@ class ModelConfig:
         self.is_hybrid = self._init_is_hybrid()
         self.has_inner_state = self._init_has_inner_state()
 
+        from vllm.platforms import current_platform
         if current_platform.is_neuron():
             self.override_neuron_config = override_neuron_config
         else:
@@ -589,6 +590,7 @@ class ModelConfig:
                 raise ValueError(
                     f"Unknown quantization method: {self.quantization}. Must "
                     f"be one of {supported_quantization}.")
+            from vllm.platforms import current_platform
             current_platform.verify_quantization(self.quantization)
             if self.quantization not in optimized_quantization_methods:
                 logger.warning(
@@ -644,6 +646,7 @@ class ModelConfig:
 
         # Reminder: Please update docs/source/usage/compatibility_matrix.md
         # If the feature combo become valid
+        from vllm.platforms import current_platform
         if not current_platform.is_async_output_supported(self.enforce_eager):
             logger.warning(
                 "Async output processing is not supported on the "
@@ -1012,6 +1015,7 @@ class CacheConfig:
             raise ValueError(
                 "GPU memory utilization must be less than 1.0. Got "
                 f"{self.gpu_memory_utilization}.")
+        from vllm.platforms import current_platform
         if (current_platform.is_cuda() and self.block_size is not None
                 and self.block_size > 32):
             raise ValueError("CUDA Paged Attention kernel only supports "
@@ -1279,6 +1283,7 @@ class ParallelConfig:
                                  f"distributed executor backend "
                                  f"'{self.distributed_executor_backend}'.")
         ray_only_devices = ["tpu", "hpu"]
+        from vllm.platforms import current_platform
         if (current_platform.device_type in ray_only_devices
                 and self.world_size > 1):
             if self.distributed_executor_backend is None:
@@ -1327,7 +1332,7 @@ class ParallelConfig:
     def _verify_args(self) -> None:
         # Lazy import to avoid circular import
         from vllm.executor.executor_base import ExecutorBase
-
+        from vllm.platforms import current_platform
         if self.distributed_executor_backend not in (
                 "ray", "mp", None) and not (isinstance(
                     self.distributed_executor_backend, type) and issubclass(
@@ -1528,6 +1533,7 @@ class DeviceConfig:
     def __init__(self, device: str = "auto") -> None:
         if device == "auto":
             # Automated device type detection
+            from vllm.platforms import current_platform
             self.device_type = current_platform.device_type
             if not self.device_type:
                 raise RuntimeError("Failed to infer device type")
@@ -2241,9 +2247,10 @@ def _get_and_verify_dtype(
             else:
                 torch_dtype = config_dtype
 
+            from vllm.platforms import current_platform
             if (current_platform.is_cpu()
                     and current_platform.get_cpu_architecture()
-                    == interface.CpuArchEnum.POWERPC
+                    == CpuArchEnum.POWERPC
                     and (config_dtype == torch.float16
                          or config_dtype == torch.float32)):
                 logger.info(
@@ -3083,6 +3090,7 @@ class VllmConfig:
             model_config: ModelConfig,
             load_config: LoadConfig) -> Optional[QuantizationConfig]:
         """Get the quantization config."""
+        from vllm.platforms import current_platform
         if model_config.quantization is not None:
             from vllm.model_executor.model_loader.weight_utils import (
                 get_quant_config)
@@ -3145,6 +3153,7 @@ class VllmConfig:
             self.quant_config = VllmConfig._get_quantization_config(
                 self.model_config, self.load_config)
 
+        from vllm.platforms import current_platform
         if self.scheduler_config is not None and \
             self.model_config is not None and \
             self.scheduler_config.chunked_prefill_enabled and \
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 5b9236f8c56b6..e6768467f4c27 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -39,7 +39,6 @@ import vllm.distributed.kv_transfer.kv_transfer_agent as kv_transfer
 import vllm.envs as envs
 from vllm.distributed.utils import StatelessProcessGroup
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.utils import direct_register_custom_op, supports_custom_op
 
 if TYPE_CHECKING:
@@ -194,6 +193,7 @@ class GroupCoordinator:
         assert self.cpu_group is not None
         assert self.device_group is not None
 
+        from vllm.platforms import current_platform
         if current_platform.is_cuda_alike():
             self.device = torch.device(f"cuda:{local_rank}")
         else:
@@ -1188,6 +1188,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
         import ray  # Lazy import Ray
         ray.shutdown()
     gc.collect()
+    from vllm.platforms import current_platform
     if not current_platform.is_cpu():
         torch.cuda.empty_cache()
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 21966d003c7ef..69c7c5077fe32 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -18,7 +18,6 @@ from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat,
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-from vllm.platforms import current_platform
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, StoreBoolean
@@ -1094,6 +1093,7 @@ class EngineArgs:
                 use_sliding_window = (model_config.get_sliding_window()
                                       is not None)
                 use_spec_decode = self.speculative_model is not None
+                from vllm.platforms import current_platform
                 if (is_gpu and not use_sliding_window and not use_spec_decode
                         and not self.enable_lora
                         and not self.enable_prompt_adapter
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 426aa1b5c728f..8d766bad1a072 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -8,7 +8,6 @@ import msgspec
 from vllm.config import ParallelConfig
 from vllm.executor.msgspec_utils import decode_hook, encode_hook
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import get_ip
 from vllm.worker.worker_base import WorkerWrapperBase
@@ -229,6 +228,7 @@ def initialize_ray_cluster(
             the default Ray cluster address.
     """
     assert_ray_available()
+    from vllm.platforms import current_platform
 
     # Connect to a ray cluster.
     if current_platform.is_rocm() or current_platform.is_xpu():
diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py
index 694c5b68b1cbd..18b435a42544a 100644
--- a/vllm/model_executor/guided_decoding/__init__.py
+++ b/vllm/model_executor/guided_decoding/__init__.py
@@ -6,7 +6,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding.utils import (
     convert_lark_to_gbnf, grammar_is_likely_lark,
     has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features)
-from vllm.platforms import CpuArchEnum, current_platform
+from vllm.platforms import CpuArchEnum
 
 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizer
@@ -39,6 +39,7 @@ def maybe_backend_fallback(
 
     if guided_params.backend == "xgrammar":
         # xgrammar only has x86 wheels for linux, fallback to outlines
+        from vllm.platforms import current_platform
         if current_platform.get_cpu_architecture() is not CpuArchEnum.X86:
             logger.warning("xgrammar is only supported on x86 CPUs. "
                            "Falling back to use outlines instead.")
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 67268eb4bb85f..07f4b5a3b3bc8 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -18,7 +18,6 @@ import cloudpickle
 import torch.nn as nn
 
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 
 from .interfaces import (has_inner_state, is_attention_free, is_hybrid,
                          supports_cross_encoding, supports_multimodal,
@@ -273,6 +272,7 @@ def _try_load_model_cls(
     model_arch: str,
     model: _BaseRegisteredModel,
 ) -> Optional[Type[nn.Module]]:
+    from vllm.platforms import current_platform
     current_platform.verify_model_arch(model_arch)
     try:
         return model.load_model_cls()
diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py
index 39ead08c238ce..6f1cc9d5e0c30 100644
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
@@ -3,10 +3,9 @@ from typing import Any, Dict, Optional
 
 import torch
 
-from vllm.platforms import current_platform
-
 
 def set_random_seed(seed: int) -> None:
+    from vllm.platforms import current_platform
     current_platform.seed_everything(seed)
 
 
@@ -38,6 +37,7 @@ def set_weight_attrs(
         # This sometimes causes OOM errors during model loading. To avoid this,
         # we sync the param tensor after its weight loader is called.
         # TODO(woosuk): Remove this hack once we have a better solution.
+        from vllm.platforms import current_platform
         if current_platform.is_tpu() and key == "weight_loader":
             value = _make_synced_weight_loader(value)
         setattr(weight, key, value)
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index 419237c252ffd..f6ac14446c021 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -1,123 +1,223 @@
+import logging
+import traceback
+from itertools import chain
+from typing import TYPE_CHECKING, Optional
+
+from vllm.plugins import load_plugins_by_group
+from vllm.utils import resolve_obj_by_qualname
+
 from .interface import _Backend  # noqa: F401
-from .interface import CpuArchEnum, Platform, PlatformEnum, UnspecifiedPlatform
+from .interface import CpuArchEnum, Platform, PlatformEnum
 
-current_platform: Platform
+logger = logging.getLogger(__name__)
 
-# NOTE: we don't use `torch.version.cuda` / `torch.version.hip` because
-# they only indicate the build configuration, not the runtime environment.
-# For example, people can install a cuda build of pytorch but run on tpu.
 
-is_tpu = False
-try:
-    # While it's technically possible to install libtpu on a non-TPU machine,
-    # this is a very uncommon scenario. Therefore, we assume that libtpu is
-    # installed if and only if the machine has TPUs.
-    import libtpu  # noqa: F401
-    is_tpu = True
-except Exception:
-    pass
-
-is_cuda = False
-
-try:
-    import pynvml
-    pynvml.nvmlInit()
+def tpu_platform_plugin() -> Optional[str]:
+    is_tpu = False
     try:
-        if pynvml.nvmlDeviceGetCount() > 0:
+        # While it's technically possible to install libtpu on a
+        # non-TPU machine, this is a very uncommon scenario. Therefore,
+        # we assume that libtpu is installed if and only if the machine
+        # has TPUs.
+        import libtpu  # noqa: F401
+        is_tpu = True
+    except Exception:
+        pass
+
+    return "vllm.platforms.tpu.TpuPlatform" if is_tpu else None
+
+
+def cuda_platform_plugin() -> Optional[str]:
+    is_cuda = False
+
+    try:
+        import pynvml
+        pynvml.nvmlInit()
+        try:
+            if pynvml.nvmlDeviceGetCount() > 0:
+                is_cuda = True
+        finally:
+            pynvml.nvmlShutdown()
+    except Exception:
+        # CUDA is supported on Jetson, but NVML may not be.
+        import os
+
+        def cuda_is_jetson() -> bool:
+            return os.path.isfile("/etc/nv_tegra_release") \
+                or os.path.exists("/sys/class/tegra-firmware")
+
+        if cuda_is_jetson():
             is_cuda = True
-    finally:
-        pynvml.nvmlShutdown()
-except Exception:
-    # CUDA is supported on Jetson, but NVML may not be.
-    import os
 
-    def cuda_is_jetson() -> bool:
-        return os.path.isfile("/etc/nv_tegra_release") \
-            or os.path.exists("/sys/class/tegra-firmware")
+    return "vllm.platforms.cuda.CudaPlatform" if is_cuda else None
 
-    if cuda_is_jetson():
-        is_cuda = True
 
-is_rocm = False
+def rocm_platform_plugin() -> Optional[str]:
+    is_rocm = False
 
-try:
-    import amdsmi
-    amdsmi.amdsmi_init()
     try:
-        if len(amdsmi.amdsmi_get_processor_handles()) > 0:
-            is_rocm = True
-    finally:
-        amdsmi.amdsmi_shut_down()
-except Exception:
-    pass
+        import amdsmi
+        amdsmi.amdsmi_init()
+        try:
+            if len(amdsmi.amdsmi_get_processor_handles()) > 0:
+                is_rocm = True
+        finally:
+            amdsmi.amdsmi_shut_down()
+    except Exception:
+        pass
 
-is_hpu = False
-try:
-    from importlib import util
-    is_hpu = util.find_spec('habana_frameworks') is not None
-except Exception:
-    pass
+    return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None
 
-is_xpu = False
 
-try:
-    # installed IPEX if the machine has XPUs.
-    import intel_extension_for_pytorch  # noqa: F401
-    import oneccl_bindings_for_pytorch  # noqa: F401
-    import torch
-    if hasattr(torch, 'xpu') and torch.xpu.is_available():
-        is_xpu = True
-except Exception:
-    pass
+def hpu_platform_plugin() -> Optional[str]:
+    is_hpu = False
+    try:
+        from importlib import util
+        is_hpu = util.find_spec('habana_frameworks') is not None
+    except Exception:
+        pass
 
-is_cpu = False
-try:
-    from importlib.metadata import version
-    is_cpu = "cpu" in version("vllm")
-except Exception:
-    pass
+    return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None
 
-is_neuron = False
-try:
-    import transformers_neuronx  # noqa: F401
-    is_neuron = True
-except ImportError:
-    pass
 
-is_openvino = False
-try:
-    from importlib.metadata import version
-    is_openvino = "openvino" in version("vllm")
-except Exception:
-    pass
+def xpu_platform_plugin() -> Optional[str]:
+    is_xpu = False
 
-if is_tpu:
-    # people might install pytorch built with cuda but run on tpu
-    # so we need to check tpu first
-    from .tpu import TpuPlatform
-    current_platform = TpuPlatform()
-elif is_cuda:
-    from .cuda import CudaPlatform
-    current_platform = CudaPlatform()
-elif is_rocm:
-    from .rocm import RocmPlatform
-    current_platform = RocmPlatform()
-elif is_hpu:
-    from .hpu import HpuPlatform
-    current_platform = HpuPlatform()
-elif is_xpu:
-    from .xpu import XPUPlatform
-    current_platform = XPUPlatform()
-elif is_cpu:
-    from .cpu import CpuPlatform
-    current_platform = CpuPlatform()
-elif is_neuron:
-    from .neuron import NeuronPlatform
-    current_platform = NeuronPlatform()
-elif is_openvino:
-    from .openvino import OpenVinoPlatform
-    current_platform = OpenVinoPlatform()
-else:
-    current_platform = UnspecifiedPlatform()
+    try:
+        # installed IPEX if the machine has XPUs.
+        import intel_extension_for_pytorch  # noqa: F401
+        import oneccl_bindings_for_pytorch  # noqa: F401
+        import torch
+        if hasattr(torch, 'xpu') and torch.xpu.is_available():
+            is_xpu = True
+    except Exception:
+        pass
 
-__all__ = ['Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum']
+    return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None
+
+
+def cpu_platform_plugin() -> Optional[str]:
+    is_cpu = False
+    try:
+        from importlib.metadata import version
+        is_cpu = "cpu" in version("vllm")
+    except Exception:
+        pass
+
+    return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None
+
+
+def neuron_platform_plugin() -> Optional[str]:
+    is_neuron = False
+    try:
+        import transformers_neuronx  # noqa: F401
+        is_neuron = True
+    except ImportError:
+        pass
+
+    return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None
+
+
+def openvino_platform_plugin() -> Optional[str]:
+    is_openvino = False
+    try:
+        from importlib.metadata import version
+        is_openvino = "openvino" in version("vllm")
+    except Exception:
+        pass
+
+    return "vllm.platforms.openvino.OpenVinoPlatform" if is_openvino else None
+
+
+builtin_platform_plugins = {
+    'tpu': tpu_platform_plugin,
+    'cuda': cuda_platform_plugin,
+    'rocm': rocm_platform_plugin,
+    'hpu': hpu_platform_plugin,
+    'xpu': xpu_platform_plugin,
+    'cpu': cpu_platform_plugin,
+    'neuron': neuron_platform_plugin,
+    'openvino': openvino_platform_plugin,
+}
+
+
+def resolve_current_platform_cls_qualname() -> str:
+    platform_plugins = load_plugins_by_group('vllm.platform_plugins')
+
+    activated_plugins = []
+
+    for name, func in chain(builtin_platform_plugins.items(),
+                            platform_plugins.items()):
+        try:
+            assert callable(func)
+            platform_cls_qualname = func()
+            if platform_cls_qualname is not None:
+                activated_plugins.append(name)
+        except Exception:
+            pass
+
+    activated_builtin_plugins = list(
+        set(activated_plugins) & set(builtin_platform_plugins.keys()))
+    activated_oot_plugins = list(
+        set(activated_plugins) & set(platform_plugins.keys()))
+
+    if len(activated_oot_plugins) >= 2:
+        raise RuntimeError(
+            "Only one platform plugin can be activated, but got: "
+            f"{activated_oot_plugins}")
+    elif len(activated_oot_plugins) == 1:
+        platform_cls_qualname = platform_plugins[activated_oot_plugins[0]]()
+        logger.info("Platform plugin %s is activated",
+                    activated_oot_plugins[0])
+    elif len(activated_builtin_plugins) >= 2:
+        raise RuntimeError(
+            "Only one platform plugin can be activated, but got: "
+            f"{activated_builtin_plugins}")
+    elif len(activated_builtin_plugins) == 1:
+        platform_cls_qualname = builtin_platform_plugins[
+            activated_builtin_plugins[0]]()
+        logger.info("Automatically detected platform %s.",
+                    activated_builtin_plugins[0])
+    else:
+        platform_cls_qualname = "vllm.interface.UnspecifiedPlatform"
+        logger.info(
+            "No platform detected, vLLM is running on UnspecifiedPlatform")
+    return platform_cls_qualname
+
+
+_current_platform = None
+_init_trace: str = ''
+
+if TYPE_CHECKING:
+    current_platform: Platform
+
+
+def __getattr__(name: str):
+    if name == 'current_platform':
+        # lazy init current_platform.
+        # 1. out-of-tree platform plugins need `from vllm.platforms import
+        #    Platform` so that they can inherit `Platform` class. Therefore,
+        #    we cannot resolve `current_platform` during the import of
+        #    `vllm.platforms`.
+        # 2. when users use out-of-tree platform plugins, they might run
+        #    `import vllm`, some vllm internal code might access
+        #    `current_platform` during the import, and we need to make sure
+        #    `current_platform` is only resolved after the plugins are loaded
+        #    (we have tests for this, if any developer violate this, they will
+        #    see the test failures).
+        global _current_platform
+        if _current_platform is None:
+            platform_cls_qualname = resolve_current_platform_cls_qualname()
+            _current_platform = resolve_obj_by_qualname(
+                platform_cls_qualname)()
+            global _init_trace
+            _init_trace = "".join(traceback.format_stack())
+        return _current_platform
+    else:
+        return globals()[name]
+
+
+__all__ = [
+    'Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum',
+    "_init_trace"
+]
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index 17f604ea0e202..c50eb2cef4cd5 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -1,10 +1,10 @@
 import logging
 import os
+from typing import Callable, Dict
 
 import torch
 
 import vllm.envs as envs
-from vllm.platforms import current_platform
 
 logger = logging.getLogger(__name__)
 
@@ -12,6 +12,39 @@ logger = logging.getLogger(__name__)
 plugins_loaded = False
 
 
+def load_plugins_by_group(group: str) -> Dict[str, Callable]:
+    import sys
+    if sys.version_info < (3, 10):
+        from importlib_metadata import entry_points
+    else:
+        from importlib.metadata import entry_points
+
+    allowed_plugins = envs.VLLM_PLUGINS
+
+    discovered_plugins = entry_points(group=group)
+    if len(discovered_plugins) == 0:
+        logger.debug("No plugins for group %s found.", group)
+        return {}
+    logger.info("Available plugins for group %s:", group)
+    for plugin in discovered_plugins:
+        logger.info("name=%s, value=%s", plugin.name, plugin.value)
+    if allowed_plugins is None:
+        logger.info("all available plugins for group %s will be loaded.",
+                    group)
+        logger.info("set environment variable VLLM_PLUGINS to control"
+                    " which plugins to load.")
+    plugins = {}
+    for plugin in discovered_plugins:
+        if allowed_plugins is None or plugin.name in allowed_plugins:
+            try:
+                func = plugin.load()
+                plugins[plugin.name] = func
+                logger.info("plugin %s loaded.", plugin.name)
+            except Exception:
+                logger.exception("Failed to load plugin %s", plugin.name)
+    return plugins
+
+
 def load_general_plugins():
     """WARNING: plugins can be loaded for multiple times in different
     processes. They should be designed in a way that they can be loaded
@@ -26,6 +59,9 @@ def load_general_plugins():
     os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
     # see https://github.com/vllm-project/vllm/issues/10619
     torch._inductor.config.compile_threads = 1
+
+    from vllm.platforms import current_platform
+
     if current_platform.is_xpu():
         # see https://github.com/pytorch/pytorch/blob/8cada5cbe5450e17c26fb8b358116785324537b2/torch/_dynamo/config.py#L158  # noqa
         os.environ['TORCH_COMPILE_DISABLE'] = 'True'
@@ -47,33 +83,7 @@ def load_general_plugins():
     if plugins_loaded:
         return
     plugins_loaded = True
-    import sys
-    if sys.version_info < (3, 10):
-        from importlib_metadata import entry_points
-    else:
-        from importlib.metadata import entry_points
-
-    allowed_plugins = envs.VLLM_PLUGINS
-
-    discovered_plugins = entry_points(group='vllm.general_plugins')
-    if len(discovered_plugins) == 0:
-        logger.debug("No plugins found.")
-        return
-    logger.info("Available plugins:")
-    for plugin in discovered_plugins:
-        logger.info("name=%s, value=%s, group=%s", plugin.name, plugin.value,
-                    plugin.group)
-    if allowed_plugins is None:
-        logger.info("all available plugins will be loaded.")
-        logger.info("set environment variable VLLM_PLUGINS to control"
-                    " which plugins to load.")
-    else:
-        logger.info("plugins to load: %s", allowed_plugins)
-    for plugin in discovered_plugins:
-        if allowed_plugins is None or plugin.name in allowed_plugins:
-            try:
-                func = plugin.load()
-                func()
-                logger.info("plugin %s loaded.", plugin.name)
-            except Exception:
-                logger.exception("Failed to load plugin %s", plugin.name)
+    plugins = load_plugins_by_group(group='vllm.general_plugins')
+    # general plugins, we only need to execute the loaded functions
+    for func in plugins.values():
+        func()
diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py
index 03dc46600d8a9..d678f4578499b 100644
--- a/vllm/spec_decode/metrics.py
+++ b/vllm/spec_decode/metrics.py
@@ -6,7 +6,6 @@ import torch
 
 from vllm.model_executor.layers.spec_decode_base_sampler import (
     SpecDecodeBaseSampler)
-from vllm.platforms import current_platform
 from vllm.utils import is_pin_memory_available
 
 
@@ -94,6 +93,7 @@ class AsyncMetricsCollector:
     def maybe_collect_rejsample_metrics(
             self, k: int) -> Optional[SpecDecodeWorkerMetrics]:
         # currently using cuda.Event, skip for any non_cuda_alike platform
+        from vllm.platforms import current_platform
         if not current_platform.is_cuda_alike():
             return None
 
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index 9ae46ff43a916..a9deee881f41a 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -17,7 +17,6 @@ import torch
 
 import vllm.envs as envs
 from vllm.connections import global_http_connection
-from vllm.platforms import current_platform
 from vllm.version import __version__ as VLLM_VERSION
 
 _config_home = envs.VLLM_CONFIG_ROOT
@@ -152,6 +151,7 @@ class UsageMessage:
                            usage_context: UsageContext,
                            extra_kvs: Dict[str, Any]) -> None:
         # Platform information
+        from vllm.platforms import current_platform
         if current_platform.is_cuda_alike():
             device_property = torch.cuda.get_device_properties(0)
             self.gpu_count = torch.cuda.device_count()
diff --git a/vllm/utils.py b/vllm/utils.py
index 2b46c1fef0d09..8ef07d2c326a3 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -50,7 +50,6 @@ from typing_extensions import ParamSpec, TypeIs, assert_never
 
 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
-from vllm.platforms import current_platform
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
@@ -609,6 +608,7 @@ def create_kv_caches_with_random_flash(
     seed: int = 0,
     device: Optional[str] = "cuda",
 ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+    from vllm.platforms import current_platform
     current_platform.seed_everything(seed)
 
     torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
@@ -650,7 +650,7 @@ def create_kv_caches_with_random(
         raise ValueError(
             f"Does not support key cache of type fp8 with head_size {head_size}"
         )
-
+    from vllm.platforms import current_platform
     current_platform.seed_everything(seed)
 
     torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
@@ -703,6 +703,7 @@ def print_warning_once(msg: str) -> None:
 
 @lru_cache(maxsize=None)
 def is_pin_memory_available() -> bool:
+    from vllm.platforms import current_platform
     return current_platform.is_pin_memory_available()
 
 
@@ -713,6 +714,7 @@ class DeviceMemoryProfiler:
 
     def current_memory_usage(self) -> float:
         # Return the memory usage in bytes.
+        from vllm.platforms import current_platform
         if current_platform.is_cuda_alike():
             torch.cuda.reset_peak_memory_stats(self.device)
             mem = torch.cuda.max_memory_allocated(self.device)
@@ -1066,6 +1068,7 @@ def _cuda_device_count_stateless(
     import torch.cuda
     import torch.version
 
+    from vllm.platforms import current_platform
     if not torch.cuda._is_compiled():
         return 0
     if current_platform.is_rocm():
@@ -1673,6 +1676,7 @@ def direct_register_custom_op(
         return
 
     if not supports_custom_op():
+        from vllm.platforms import current_platform
         assert not current_platform.is_cuda_alike(), (
             "cuda platform needs torch>=2.4 to support custom op, "
             "chances are you are using an old version of pytorch "
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index cd4770202a186..c7abad7e0258d 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -12,7 +12,6 @@ from torch import is_tensor
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
 
 if TYPE_CHECKING:
@@ -265,13 +264,13 @@ class ModelRunnerBase(ABC, Generic[T]):
         """
         raise NotImplementedError
 
-    @current_platform.inference_mode()
     def execute_model(
         self,
         model_input: T,
         kv_caches: Optional[List[torch.Tensor]],
-        intermediate_tensors: Optional[IntermediateTensors],
+        intermediate_tensors: Optional[IntermediateTensors] = None,
         num_steps: int = 1,
+        **kwargs,
     ) -> Optional[List[SamplerOutput]]:
         """
         Execute the model on the given input.
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index 65d9bab0e2822..dee63a75c0605 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -544,6 +544,7 @@ class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]):
         model_input.record_step_event(current_stream)
 
         if get_pp_group().is_last_rank and self.is_driver_worker:
+            assert isinstance(output, list)
             assert len(
                 output
             ) == 1, "MultiStepModelRunner requires single-step base_models"
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 3ac7fb8dfb766..249b3ed2dfd37 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -11,7 +11,6 @@ from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import (enable_trace_function_call_for_thread,
                         resolve_obj_by_qualname, update_environment_variables)
@@ -44,6 +43,8 @@ class WorkerBase(ABC):
         self.prompt_adapter_config = vllm_config.prompt_adapter_config
         self.observability_config = vllm_config.observability_config
         self.kv_transfer_config = vllm_config.kv_transfer_config
+        from vllm.platforms import current_platform
+        self.current_platform = current_platform
 
     @abstractmethod
     def init_device(self) -> None:
@@ -74,17 +75,17 @@ class WorkerBase(ABC):
         """
         raise NotImplementedError
 
-    @current_platform.inference_mode()
     def start_worker_execution_loop(self) -> None:
         """Execute model loop in parallel worker.
 
         You can stop the loop by executing a driver worker with an empty output.
         See `stop_remote_worker_execution_loop` for more details.
         """
-        while True:
-            output = self.execute_model(execute_model_req=None)
-            if output is None:
-                return None
+        with self.current_platform.inference_mode():
+            while True:
+                output = self.execute_model(execute_model_req=None)
+                if output is None:
+                    return None
 
     @abstractmethod
     def execute_model(
@@ -352,6 +353,7 @@ class LocalOrDistributedWorkerBase(WorkerBase):
         model_execute_time = time.perf_counter() - start_time
         if not get_pp_group().is_last_rank:
             # output is IntermediateTensors
+            assert isinstance(output, IntermediateTensors)
             if (self.observability_config is not None
                     and self.observability_config.collect_model_execute_time):
                 output.tensors["model_execute_time"] = torch.tensor(

From 8d9b6721e7f5b7d191951c6f1cd12710ffd08093 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 30 Dec 2024 23:01:35 +0800
Subject: [PATCH 33/48] [VLM] Abstract out multi-modal data parsing in merged
 processor (#11620)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .buildkite/test-pipeline.yaml             |   4 +-
 vllm/model_executor/models/chatglm.py     |   4 +-
 vllm/model_executor/models/llava.py       |  18 +-
 vllm/model_executor/models/phi3v.py       |  19 +-
 vllm/model_executor/models/qwen2_audio.py |  22 +-
 vllm/model_executor/models/qwen2_vl.py    | 151 +++++-----
 vllm/model_executor/models/ultravox.py    |  22 +-
 vllm/multimodal/__init__.py               |   9 +-
 vllm/multimodal/audio.py                  |   4 +-
 vllm/multimodal/base.py                   |   8 +-
 vllm/multimodal/image.py                  |   4 +-
 vllm/multimodal/inputs.py                 | 195 ++++--------
 vllm/multimodal/parse.py                  | 344 ++++++++++++++++++++++
 vllm/multimodal/processing.py             |  62 ++--
 vllm/multimodal/video.py                  |   4 +-
 15 files changed, 559 insertions(+), 311 deletions(-)
 create mode 100644 vllm/multimodal/parse.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index bee968b4d2e43..c6f8316412e2f 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -356,7 +356,7 @@ steps:
     - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
     - pytest -v -s models/embedding/language -m 'not core_model'
 
-- label: Multi-Modal Models Test (Standard) # 28min
+- label: Multi-Modal Models Test (Standard) # 40min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -372,7 +372,7 @@ steps:
     - pytest -v -s models/encoder_decoder/language -m core_model
     - pytest -v -s models/encoder_decoder/vision_language -m core_model
 
-- label: Multi-Modal Models Test (Extended) 1 # 1h16m
+- label: Multi-Modal Models Test (Extended) 1 # 48m
   optional: true
   source_file_dependencies:
   - vllm/
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 6c50882d83c3b..ffd6891b25965 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -33,7 +33,7 @@ from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalData, MultiModalKwargs,
+from vllm.multimodal.inputs import (ModalityData, MultiModalKwargs,
                                     NestedTensors)
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
@@ -54,7 +54,7 @@ def calculate_image_placeholder(vision_config):
 
 def mm_input_mapper_for_glmv(
     ctx: InputContext,
-    data: MultiModalData[object],
+    data: ModalityData[object],
 ) -> Dict:
     model_config = ctx.model_config
     tokenizer = cached_get_tokenizer(
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 0ecba5a1cae0f..1d6ee2a0be72e 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -20,11 +20,13 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems,
-                                    MultiModalFieldConfig, MultiModalInputsV2,
-                                    MultiModalKwargs, NestedTensors)
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalInputsV2, MultiModalKwargs,
+                                    NestedTensors)
+from vllm.multimodal.parse import ImageProcessorItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        ProcessorInputs, PromptReplacement,
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement,
                                         full_groupby_modality)
 from vllm.sequence import IntermediateTensors
 
@@ -179,7 +181,9 @@ class LlavaMultiModalProcessor(BaseMultiModalProcessor):
             assert isinstance(vision_config, PixtralVisionConfig)
 
             def get_replacement_pixtral(item_idx: int):
-                image_size = mm_items.get_image_size(item_idx)
+                images = mm_items.get_items("image", ImageProcessorItems)
+                image_size = images.get_image_size(item_idx)
+
                 (
                     num_width_tokens,
                     num_height_tokens,
@@ -591,8 +595,8 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
 
         result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
 
-        mm_items = self._get_mm_items(mm_data)
-        mm_item_counts = mm_items.get_item_counts()
+        mm_items = self._to_mm_items(mm_data)
+        mm_item_counts = mm_items.get_all_counts()
         mm_kwargs = result["mm_kwargs"]
 
         # We reimplement the functionality of MLlavaProcessor from
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index fefa9fd62d1d0..15362db6cdfbf 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -32,12 +32,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems,
-                                    MultiModalFieldConfig, MultiModalInputsV2,
-                                    MultiModalKwargs, NestedTensors,
-                                    PlaceholderRange)
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalInputsV2, MultiModalKwargs,
+                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.parse import ImageProcessorItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        ProcessorInputs, PromptReplacement,
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement,
                                         _BoundPromptReplacement,
                                         _PlaceholderInfo)
 from vllm.sequence import IntermediateTensors
@@ -381,7 +382,9 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor):
         assert isinstance(bos_token_id, int)
 
         def get_replacement_phi3v(item_idx: int):
-            image_size = mm_items.get_image_size(item_idx)
+            images = mm_items.get_items("image", ImageProcessorItems)
+            image_size = images.get_image_size(item_idx)
+
             num_tokens = image_processor.calc_num_image_tokens_from_image_size(
                 width=image_size.width,
                 height=image_size.height,
@@ -389,12 +392,14 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor):
 
             return [_IMAGE_TOKEN_ID] * num_tokens + [bos_token_id]
 
+        num_images = mm_items.get_count("image", strict=False)
+
         return [
             PromptReplacement(
                 modality="image",
                 target=image_token,
                 replacement=get_replacement_phi3v,
-            ) for image_token in image_tokens[:len(mm_items.images)]
+            ) for image_token in image_tokens[:num_images]
         ]
 
     def _apply_prompt_replacements(
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 25a351bd9c656..e3d43b017f894 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -20,8 +20,8 @@
 # limitations under the License.
 """Inference-only Qwen2-Audio model compatible with HuggingFace weights."""
 from functools import cached_property
-from typing import (Any, Iterable, List, Mapping, Optional, Set, Tuple,
-                    TypedDict, Union)
+from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
+                    Union)
 
 import numpy as np
 import torch
@@ -38,10 +38,12 @@ from vllm.inputs import InputContext
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataItems, MultiModalFieldConfig,
-                                    MultiModalKwargs, NestedTensors)
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
+from vllm.multimodal.parse import MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        ProcessorInputs, PromptReplacement)
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsMultiModal, SupportsPP
@@ -99,15 +101,9 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor):
     def _get_feature_extractor(self) -> WhisperFeatureExtractor:
         return self._get_hf_processor().feature_extractor  # type: ignore
 
-    def _get_hf_mm_data(
-        self,
-        mm_items: MultiModalDataItems,
-    ) -> tuple[dict[str, Any], dict[str, Any]]:
-        # resample audio to the model's sampling rate
+    def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self._get_feature_extractor()
-        mm_items.resample_audios(feature_extractor.sampling_rate)
-
-        return super()._get_hf_mm_data(mm_items)
+        return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
 
     def _call_hf_processor(
         self,
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 574845ef5a525..6181fe3dd13d8 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -25,7 +25,6 @@ from functools import cached_property, partial
 from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional,
                     Set, Tuple, Type, TypedDict, Union)
 
-import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -55,15 +54,16 @@ from vllm.model_executor.layers.quantization.gptq_marlin import (
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems,
+from vllm.multimodal.inputs import (ImageItem, ModalityData,
                                     MultiModalFieldConfig, MultiModalKwargs,
-                                    NestedTensors)
+                                    NestedTensors, VideoItem)
+from vllm.multimodal.parse import ModalityDataItems, MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        ProcessorInputs, PromptReplacement)
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
 from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
-from vllm.utils import is_list_of
 
 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend,
@@ -719,61 +719,81 @@ get_max_qwen2_vl_video_tokens = partial(get_max_qwen2_vl_mm_tokens,
                                         data_type_key="video")
 
 
-class Qwen2VLMultiModalDataItems(MultiModalDataItems):
+class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor],
+                                            dict[str, torch.Tensor]]):
 
-    @staticmethod
-    def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems":
-        """
-        Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
-        """
-        multi_data = Qwen2VLMultiModalDataItems()
+    def __init__(self, data: dict, modality: str) -> None:
+        super().__init__(data)
 
-        for k, v in data.items():
-            # TODO: Make a separate modality for embedding inputs
-            # to avoid confusion
-            # yapf: disable
-            if k == "video":
-                # Special case since even a single item can be a list
-                multi_data[k] = (  # type: ignore[index]
-                    v if (
-                        isinstance(v, (dict, torch.Tensor))  # type: ignore[assignment]
-                        or is_list_of(v, list)
-                        or isinstance(v[0], (np.ndarray, torch.Tensor))
-                           and v[0].ndim == 4
-                    ) else [v]
-                )
-            elif k in ("image", "audio"):
-                multi_data[k] = (  # type: ignore[index]
-                    v if isinstance(v, (dict, torch.Tensor, list)) else [v]
-                )
-            else:
-                multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
-            # yapf: enable
+        self.modality = modality
 
-        return multi_data
+        grid_thw = data[f"{modality}_grid_thw"]
+        slice_idxs = [0] + grid_thw.prod(-1).cumsum_(0).tolist()
+        self._slices = [
+            slice(slice_idxs[i], slice_idxs[i + 1])
+            for i in range(len(grid_thw))
+        ]
 
-    def get_item_counts(self) -> Mapping[str, int]:
-        return {
-            m: (
-                len(items[f"{m}_grid_thw"])  # type: ignore
-                if isinstance(items, dict) else len(items))
-            for m, items in self.items()
-        }
+    def __repr__(self) -> str:
+        return (f"{type(self).__name__}(modality={self.modality!r})")
 
-    def has_embedding_inputs(self) -> bool:
-        return any(
-            isinstance(items, dict) or any(
-                isinstance(item, torch.Tensor) for item in items)
-            for items in self.values())
+    def get_count(self) -> int:
+        return len(self.data[f"{self.modality}_grid_thw"])
+
+    def get(self, index: int) -> dict[str, torch.Tensor]:
+        out = {}
+        for k, v in self.data.items():
+            if v != f"{self.modality}_grid_thw":
+                v = v[self._slices[index]]
+
+            out[k] = v
+
+        return out
+
+    def get_processor_data(self) -> Mapping[str, object]:
+        return {}
+
+    def get_passthrough_data(self) -> Mapping[str, object]:
+        return self.data
+
+
+class Qwen2ImageEmbeddingItems(Qwen2EmbeddingItems):
+
+    def __init__(self, data: dict) -> None:
+        super().__init__(data, "image")
+
+
+class Qwen2VideoEmbeddingItems(Qwen2EmbeddingItems):
+
+    def __init__(self, data: dict) -> None:
+        super().__init__(data, "video")
+
+
+class Qwen2MultiModalDataParser(MultiModalDataParser):
+
+    def _parse_image_data(
+        self,
+        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
+    ) -> ModalityDataItems[Any, Any]:
+        if isinstance(data, dict):
+            return Qwen2EmbeddingItems(data, modality="image")
+
+        return super()._parse_image_data(data)
+
+    def _parse_video_data(
+        self,
+        data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
+    ) -> ModalityDataItems[Any, Any]:
+        if isinstance(data, dict):
+            return Qwen2EmbeddingItems(data, modality="video")
+
+        return super()._parse_video_data(data)
 
 
 class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
 
-    def _get_mm_items(
-        self,
-        mm_data: MultiModalDataDict,
-    ) -> MultiModalDataItems:
-        return Qwen2VLMultiModalDataItems.from_dict(mm_data)
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return Qwen2MultiModalDataParser()
 
     def _get_hf_processor(
         self,
@@ -796,35 +816,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
 
         return hf_processor
 
-    def _get_hf_mm_data(
-        self,
-        mm_items: MultiModalDataItems,
-    ) -> tuple[dict[str, Any], dict[str, Any]]:
-        processor_data = dict[str, Any]()
-        passthrough_data = dict[str, Any]()
-
-        for k, v in mm_items.items():
-            # TODO: Make a separate modality for embedding inputs
-            # to avoid confusion
-            if k in ("image", "video", "audio"):
-                if isinstance(v, dict):
-                    # Pass through embedding inputs (dict)
-                    passthrough_data.update(v)
-                elif isinstance(v, torch.Tensor) and v.ndim == 3:
-                    # Pass through embedding inputs (single)
-                    passthrough_data[f"{k}_embeds"] = [v]
-                elif (is_list_of(v, torch.Tensor) and len(v) > 0
-                      and v[0].ndim == 2):
-                    # Pass through embedding inputs (multi)
-                    passthrough_data[f"{k}_embeds"] = v
-                elif len(v) > 0:
-                    # Map keys to plural form, e.g.: image -> images
-                    processor_data[f"{k}s"] = v
-            else:
-                processor_data[k] = v
-
-        return processor_data, passthrough_data
-
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 7b4aeeec5f403..7e853e5b90096 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -3,8 +3,8 @@
 
 import math
 from functools import cached_property, lru_cache
-from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set,
-                    Tuple, TypedDict, Union)
+from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
+                    TypedDict, Union)
 
 import numpy as np
 import torch
@@ -24,10 +24,12 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.loader import DefaultModelLoader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataItems, MultiModalFieldConfig,
-                                    MultiModalKwargs, NestedTensors)
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
+from vllm.multimodal.parse import MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        ProcessorInputs, PromptReplacement)
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 from vllm.utils import is_list_of
@@ -85,15 +87,9 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
         hf_processor = self._get_hf_processor()
         return hf_processor.audio_processor.feature_extractor  # type: ignore
 
-    def _get_hf_mm_data(
-        self,
-        mm_items: MultiModalDataItems,
-    ) -> tuple[dict[str, Any], dict[str, Any]]:
-        # resample audio to the model's sampling rate
+    def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self._get_feature_extractor()
-        mm_items.resample_audios(feature_extractor.sampling_rate)
-
-        return super()._get_hf_mm_data(mm_items)
+        return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
 
     def _call_hf_processor(
         self,
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 9255e062e4870..e58bbe81717a0 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,8 +1,7 @@
 from .base import MultiModalPlaceholderMap, MultiModalPlugin
-from .inputs import (BatchedTensorInputs, MultiModalData,
-                     MultiModalDataBuiltins, MultiModalDataDict,
-                     MultiModalKwargs, MultiModalPlaceholderDict,
-                     NestedTensors)
+from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins,
+                     MultiModalDataDict, MultiModalKwargs,
+                     MultiModalPlaceholderDict, NestedTensors)
 from .registry import MultiModalRegistry
 
 MULTIMODAL_REGISTRY = MultiModalRegistry()
@@ -16,7 +15,7 @@ See also:
 
 __all__ = [
     "BatchedTensorInputs",
-    "MultiModalData",
+    "ModalityData",
     "MultiModalDataBuiltins",
     "MultiModalDataDict",
     "MultiModalKwargs",
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index 3e09ef1fcbb56..de80f22bac2a3 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -9,7 +9,7 @@ from vllm.inputs.registry import InputContext
 from vllm.utils import PlaceholderModule
 
 from .base import MediaIO, MultiModalPlugin
-from .inputs import AudioItem, MultiModalData, MultiModalKwargs
+from .inputs import AudioItem, ModalityData, MultiModalKwargs
 
 try:
     import librosa
@@ -31,7 +31,7 @@ class AudioPlugin(MultiModalPlugin):
     def _default_input_mapper(
         self,
         ctx: InputContext,
-        data: MultiModalData[AudioItem],
+        data: ModalityData[AudioItem],
         **mm_processor_kwargs,
     ) -> MultiModalKwargs:
         raise NotImplementedError("There is no default audio input mapper")
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index cdda6f8052794..7f4029e726332 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -15,12 +15,12 @@ if TYPE_CHECKING:
     from vllm.config import ModelConfig
     from vllm.sequence import SequenceGroupMetadata
 
-from .inputs import (MultiModalData, MultiModalDataDict, MultiModalKwargs,
+from .inputs import (ModalityData, MultiModalDataDict, MultiModalKwargs,
                      PlaceholderRange)
 
 logger = init_logger(__name__)
 
-MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]],
+MultiModalInputMapper = Callable[[InputContext, ModalityData[object]],
                                  MultiModalKwargs]
 """
 Return a dictionary to be passed as keyword arguments to
@@ -69,7 +69,7 @@ class MultiModalPlugin(ABC):
     def _default_input_mapper(
         self,
         ctx: InputContext,
-        data: MultiModalData[Any],
+        data: ModalityData[Any],
         **mm_processor_kwargs,
     ) -> MultiModalKwargs:
         """
@@ -118,7 +118,7 @@ class MultiModalPlugin(ABC):
     def map_input(
         self,
         model_config: "ModelConfig",
-        data: MultiModalData[Any],
+        data: ModalityData[Any],
         mm_processor_kwargs: Optional[dict[str, Any]],
     ) -> MultiModalKwargs:
         """
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 14c79dfadec0c..da13a381c4530 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -13,7 +13,7 @@ from vllm.transformers_utils.processor import get_image_processor
 from vllm.utils import is_list_of
 
 from .base import MediaIO, MultiModalPlugin
-from .inputs import ImageItem, MultiModalData, MultiModalKwargs
+from .inputs import ImageItem, ModalityData, MultiModalKwargs
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
@@ -44,7 +44,7 @@ class ImagePlugin(MultiModalPlugin):
     def _default_input_mapper(
         self,
         ctx: InputContext,
-        data: MultiModalData[ImageItem],
+        data: ModalityData[ImageItem],
         **mm_processor_kwargs,
     ) -> MultiModalKwargs:
         model_config = ctx.model_config
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 1fbda6e0b8750..db489af7ac475 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -2,53 +2,74 @@ from abc import ABC, abstractmethod
 from collections import UserDict, defaultdict
 from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
-from typing import (Any, Literal, NamedTuple, TypedDict, TypeVar, Union, cast,
-                    final)
+from typing import Any, Literal, TypedDict, TypeVar, Union, cast, final
 
 import numpy as np
 import torch
 import torch.types
 from PIL.Image import Image
 from transformers import BatchFeature
-from typing_extensions import NotRequired, TypeAlias, assert_never
+from typing_extensions import NotRequired, TypeAlias
 
 from vllm.utils import JSONTree, is_list_of, json_map_leaves
 
 _T = TypeVar("_T")
 
-# yapf: disable
-ImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
+HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
 """
 A :class:`transformers.image_utils.ImageInput` representing a single image
 item, which can be passed to a HuggingFace :code:`ImageProcessor`.
 """
 
-VideoItem: TypeAlias = Union[
-    list[Image],
-    np.ndarray,
-    torch.Tensor,
-    list[np.ndarray],
-    list[torch.Tensor],
-]
+HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor,
+                               list[np.ndarray], list[torch.Tensor]]
 """
 A :class:`transformers.image_utils.VideoInput` representing a single video
 item, which can be passed to a HuggingFace :code:`VideoProcessor`.
 """
 
-AudioItem: TypeAlias = Union[
-    np.ndarray,
-    list[float],
-    # `(audio, sampling_rate)`: If the audio's sampling rate is different
-    # from that expected by the model, we need to resample it.
-    tuple[np.ndarray, float],
-]
+HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor]
 """
 Represents a single audio
 item, which can be passed to a HuggingFace :code:`AudioProcessor`.
 """
-# yapf: enable
 
-MultiModalData: TypeAlias = Union[_T, list[_T]]
+ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor]
+"""
+A :class:`transformers.image_utils.ImageInput` representing a single image
+item, which can be passed to a HuggingFace :code:`ImageProcessor`.
+
+Alternatively, a 3-D tensor or batch of 2-D tensors,
+which are treated as image embeddings;
+these are directly passed to the model without HF processing.
+"""
+
+VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor]
+"""
+A :class:`transformers.image_utils.VideoInput` representing a single video
+item, which can be passed to a HuggingFace :code:`VideoProcessor`.
+
+Alternatively, a 3-D tensor or batch of 2-D tensors,
+which are treated as video embeddings;
+these are directly passed to the model without HF processing.
+"""
+
+AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float],
+                             torch.Tensor]
+"""
+Represents a single audio
+item, which can be passed to a HuggingFace :code:`AudioProcessor`.
+
+Alternatively, a tuple `(audio, sampling_rate)`, where the sampling rate
+is different from that expected by the model;
+these are resampled to the model's sampling rate before being processed by HF.
+
+Alternatively, a 3-D tensor or batch of 2-D tensors,
+which are treated as audio embeddings;
+these are directly passed to the model without HF processing.
+"""
+
+ModalityData: TypeAlias = Union[_T, list[_T]]
 """
 Either a single data item, or a list of data items.
 
@@ -61,17 +82,17 @@ The number of data items allowed per modality is restricted by
 class MultiModalDataBuiltins(TypedDict, total=False):
     """Type annotations for modality types predefined by vLLM."""
 
-    image: MultiModalData[ImageItem]
+    image: ModalityData[ImageItem]
     """The input image(s)."""
 
-    video: MultiModalData[VideoItem]
+    video: ModalityData[VideoItem]
     """The input video(s)."""
 
-    audio: MultiModalData[AudioItem]
+    audio: ModalityData[AudioItem]
     """The input audio(s)."""
 
 
-MultiModalDataDict: TypeAlias = Mapping[str, MultiModalData[Any]]
+MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
 """
 A dictionary containing an entry for each modality type to input.
 
@@ -83,123 +104,6 @@ Note:
 """
 
 
-class ImageSize(NamedTuple):
-    width: int
-    height: int
-
-
-class MultiModalDataItems(UserDict[str, list[Any]]):
-    """
-    As :class:`MultiModalDataDict`, but normalized such that each entry
-    corresponds to a list.
-    """
-
-    @staticmethod
-    def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems":
-        """
-        Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
-        """
-        multi_data = MultiModalDataItems()
-
-        for k, v in data.items():
-            # TODO: Make a separate modality for embedding inputs
-            # to avoid confusion
-            # yapf: disable
-            if k == "video":
-                # Special case since even a single item can be a list
-                multi_data[k] = (  # type: ignore[index]
-                    v if (
-                        isinstance(v, torch.Tensor)
-                        or is_list_of(v, list)
-                        or isinstance(v[0], (np.ndarray, torch.Tensor))
-                           and v[0].ndim == 4
-                    ) else [v]
-                )
-            elif k in ("image", "audio"):
-                multi_data[k] = (  # type: ignore[index]
-                    v if isinstance(v, (torch.Tensor, list)) else [v]
-                )
-            else:
-                multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
-            # yapf: enable
-
-        return multi_data
-
-    # NOTE: When a field (e.g. `images`) doesn't exist, directly appending to
-    # `self.images` doesn't update this dictionary, which may be confusing
-    # We annotate the getter methods as `Sequence` to prevent others from
-    # trying to update the list in this way
-    @property
-    def images(self) -> Sequence[ImageItem]:
-        return self.get("image", [])
-
-    @property
-    def videos(self) -> Sequence[VideoItem]:
-        return self.get("video", [])
-
-    @property
-    def audios(self) -> Sequence[AudioItem]:
-        return self.get("audio", [])
-
-    def get_item_counts(self) -> Mapping[str, int]:
-        return {m: len(items) for m, items in self.items()}
-
-    def has_embedding_inputs(self) -> bool:
-        return any(
-            any(isinstance(item, torch.Tensor) for item in items)
-            for items in self.values())
-
-    def get_image_size(self, item_idx: int) -> ImageSize:
-        image = self.images[item_idx]
-
-        if isinstance(image, Image):
-            return ImageSize(*image.size)
-        if isinstance(image, (np.ndarray, torch.Tensor)):
-            _, h, w = image.shape
-            return ImageSize(w, h)
-
-        assert_never(image)
-
-    def get_audio_with_sr(
-        self,
-        item_idx: int,
-        *,
-        default_sr: float,
-    ) -> tuple[np.ndarray, float]:
-        audio = self.audios[item_idx]
-
-        if isinstance(audio, tuple):
-            return audio
-        if isinstance(audio, list):
-            return np.array(audio), default_sr
-        if isinstance(audio, np.ndarray):
-            return audio, default_sr
-
-        assert_never(audio)
-
-    def resample_audios(self, new_sr: float, *, drop_sr: bool = True) -> None:
-        """
-        If :code:`drop_sr=True`, the audio items in this dictionary are updated
-        to be NumPy arrays which implicitly means that their sampling rate is
-        the same as the model's expected sampling rate; otherwise, they remain
-        as :code:`(audio, new_sr)` tuples.
-        """
-        # Avoid circular import
-        from .audio import resample_audio
-
-        if not self.audios:
-            return
-
-        new_audios = []
-        for item_idx in range(len(self.audios)):
-            audio, sr = self.get_audio_with_sr(item_idx, default_sr=new_sr)
-            audio = resample_audio(audio, orig_sr=sr, target_sr=new_sr)
-
-            new_audios.append(audio if drop_sr else (audio, new_sr))
-
-        self["audio"] = new_audios
-
-
 class PlaceholderRange(TypedDict):
     """
     Placeholder location information for multi-modal data.
@@ -436,7 +340,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
     ) -> "MultiModalKwargs":
         data = {
             key: items[0].field.reduce(items).data
-            for key, items in items_by_key.items()
+            for key, items in items_by_key.items() if len(items) > 0
         }
 
         return MultiModalKwargs(data,
@@ -567,6 +471,11 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
         Get the keyword arguments corresponding to an item identified by
         its modality and index.
         """
+        if modality not in self._keys_by_modality:
+            available_modalities = set(self._keys_by_modality.keys())
+            raise KeyError(f"Modality {modality!r} not found. "
+                           f"Available modalities: {available_modalities}")
+
         keys_to_gather = self._keys_by_modality[modality]
 
         return {
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
new file mode 100644
index 0000000000000..17a795247372e
--- /dev/null
+++ b/vllm/multimodal/parse.py
@@ -0,0 +1,344 @@
+from abc import ABC, abstractmethod
+from collections import UserDict
+from collections.abc import Callable, Iterator, Mapping, Sequence
+from typing import TYPE_CHECKING, Any, Generic, NamedTuple, Optional, TypeVar
+
+import numpy as np
+import torch
+from PIL.Image import Image
+from typing_extensions import TypeAlias, TypeGuard, assert_never
+
+from vllm.utils import is_list_of
+
+from .audio import resample_audio
+from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem,
+                     ImageItem, ModalityData, MultiModalDataDict,
+                     NestedTensors, VideoItem)
+
+_T = TypeVar("_T")
+_I = TypeVar("_I")
+
+
+class ModalityDataItems(ABC, Generic[_T, _I]):
+
+    def __init__(self, data: _T) -> None:
+        super().__init__()
+
+        self.data = data
+
+    def __len__(self) -> int:
+        return self.get_count()
+
+    def __getitem__(self, index: int) -> _I:
+        return self.get(index)
+
+    if TYPE_CHECKING:
+        # Auto-generated
+        def __iter__(self) -> Iterator[_I]:
+            ...
+
+    @abstractmethod
+    def get_count(self) -> int:
+        """Get the number of data items."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def get(self, index: int) -> _I:
+        """Get a data item by its index."""
+        raise NotImplementedError
+
+    def get_all(self) -> list[_I]:
+        """Get all data items."""
+        return [self.get(idx) for idx in range(self.get_count())]
+
+    @abstractmethod
+    def get_processor_data(self) -> Mapping[str, object]:
+        """Get the data to pass to the HF processor."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_passthrough_data(self) -> Mapping[str, object]:
+        """Get the data to pass directly to the model."""
+        raise NotImplementedError
+
+
+class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
+
+    def __init__(self, data: Sequence[_T], modality: str) -> None:
+        super().__init__(data)
+
+        self.modality = modality
+
+    def __repr__(self) -> str:
+        return (f"{type(self).__name__}(modality={self.modality!r})")
+
+    def get_count(self) -> int:
+        return len(self.data)
+
+    def get(self, index: int) -> _T:
+        return self.data[index]
+
+    def get_processor_data(self) -> Mapping[str, object]:
+        return {f"{self.modality}s": self.data}
+
+    def get_passthrough_data(self) -> Mapping[str, object]:
+        return {}
+
+
+class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]):
+
+    def __init__(self, data: NestedTensors, modality: str) -> None:
+        super().__init__(data)
+
+        self.modality = modality
+
+    def __repr__(self) -> str:
+        return (f"{type(self).__name__}(modality={self.modality!r})")
+
+    def get_count(self) -> int:
+        return len(self.data)
+
+    def get(self, index: int) -> object:
+        return self.data[index]
+
+    def get_processor_data(self) -> Mapping[str, object]:
+        return {}
+
+    def get_passthrough_data(self) -> Mapping[str, object]:
+        return {f"{self.modality}_embeds": self.data}
+
+
+class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
+
+    def __init__(self, data: Sequence[HfAudioItem]) -> None:
+        super().__init__(data, "audio")
+
+
+class AudioEmbeddingItems(EmbeddingItems):
+
+    def __init__(self, data: NestedTensors) -> None:
+        super().__init__(data, "audio")
+
+
+class ImageSize(NamedTuple):
+    width: int
+    height: int
+
+
+class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):
+
+    def __init__(self, data: Sequence[HfImageItem]) -> None:
+        super().__init__(data, "image")
+
+    def get_image_size(self, item_idx: int) -> ImageSize:
+        image = self.get(item_idx)
+
+        if isinstance(image, Image):
+            return ImageSize(*image.size)
+        if isinstance(image, (np.ndarray, torch.Tensor)):
+            _, h, w = image.shape
+            return ImageSize(w, h)
+
+        assert_never(image)
+
+
+class ImageEmbeddingItems(EmbeddingItems):
+
+    def __init__(self, data: NestedTensors) -> None:
+        super().__init__(data, "image")
+
+
+class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
+
+    def __init__(self, data: Sequence[HfVideoItem]) -> None:
+        super().__init__(data, "video")
+
+
+class VideoEmbeddingItems(EmbeddingItems):
+
+    def __init__(self, data: NestedTensors) -> None:
+        super().__init__(data, "video")
+
+
+_D = TypeVar("_D", bound=ModalityDataItems[Any, Any])
+
+
+class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
+    """
+    As :class:`MultiModalDataDict`, but normalized such that each entry
+    corresponds to a list.
+    """
+
+    def get_count(self, modality: str, *, strict: bool = True) -> int:
+        """
+        Get the number of data items belonging to a modality.
+        
+        If `strict=False`, return `0` instead of raising :exc:`KeyError`
+        even if the modality is not found.
+        """
+        if modality not in self:
+            if strict:
+                available_modalities = set(self.keys())
+                raise KeyError(f"Modality {modality!r} not found. "
+                               f"Available modalities: {available_modalities}")
+
+            return 0
+
+        return self[modality].get_count()
+
+    def get_all_counts(self) -> Mapping[str, int]:
+        """Get the number of items belonging to each modality."""
+        return {m: items.get_count() for m, items in self.items()}
+
+    def get_items(
+        self,
+        modality: str,
+        typ: type[_D],
+    ) -> _D:
+        """
+        Get the data items belonging to a modality,
+        requiring that they belong to a certain type.
+        """
+        if modality not in self:
+            available_modalities = set(self.keys())
+            raise KeyError(f"Modality {modality!r} not found. "
+                           f"Available modalities: {available_modalities}")
+
+        items = self[modality]
+        if not isinstance(items, typ):
+            raise TypeError(f"Invalid type of data items for {modality=}. "
+                            f"Expected type: {typ}, but "
+                            f"found type: {type(items)}")
+
+        return items
+
+
+ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
+                                         ModalityDataItems[Any, Any]]
+
+
+class MultiModalDataParser:
+    """
+    Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`.
+    """
+
+    def __init__(self, *, target_sr: Optional[float] = None) -> None:
+        super().__init__()
+
+        self.target_sr = target_sr
+
+    def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]:
+        if isinstance(data, torch.Tensor):
+            return data.ndim == 3
+        if is_list_of(data, torch.Tensor):
+            return len(data) == 0 or data[0].ndim == 2
+
+        return False
+
+    def _get_audio_with_sr(
+        self,
+        audio: AudioItem,
+    ) -> tuple[np.ndarray, Optional[float]]:
+        if isinstance(audio, tuple):
+            return audio
+        if isinstance(audio, list):
+            return np.array(audio), None
+        if isinstance(audio, np.ndarray):
+            return audio, None
+        if isinstance(audio, torch.Tensor):
+            return audio.numpy(), None
+
+        assert_never(audio)
+
+    def _parse_audio_data(
+        self,
+        data: ModalityData[AudioItem],
+    ) -> ModalityDataItems[Any, Any]:
+        if self._is_embeddings(data):
+            return AudioEmbeddingItems(data)
+
+        if (is_list_of(data, float)
+                or isinstance(data,
+                              (np.ndarray, torch.Tensor)) and data.ndim == 1
+                or isinstance(data, tuple)):
+            data_items = [data]
+        elif isinstance(data, (np.ndarray, torch.Tensor)):
+            data_items = [elem for elem in data]
+        else:
+            data_items = data
+
+        new_audios = list[np.ndarray]()
+        for data_item in data_items:
+            audio, orig_sr = self._get_audio_with_sr(data_item)
+            if orig_sr is None:
+                new_audio = audio
+            else:
+                target_sr = self.target_sr
+                if target_sr is None:
+                    raise RuntimeError(
+                        "Audio resampling is not supported when "
+                        "`target_sr` is not provided")
+
+                new_audio = resample_audio(audio,
+                                           orig_sr=orig_sr,
+                                           target_sr=target_sr)
+
+            new_audios.append(new_audio)
+
+        return AudioProcessorItems(new_audios)
+
+    def _parse_image_data(
+        self,
+        data: ModalityData[ImageItem],
+    ) -> ModalityDataItems[Any, Any]:
+        if self._is_embeddings(data):
+            return ImageEmbeddingItems(data)
+
+        if (isinstance(data, Image)
+                or isinstance(data,
+                              (np.ndarray, torch.Tensor)) and data.ndim == 3):
+            data_items = [data]
+        elif isinstance(data, (np.ndarray, torch.Tensor)):
+            data_items = [elem for elem in data]
+        else:
+            data_items = data
+
+        return ImageProcessorItems(data_items)
+
+    def _parse_video_data(
+        self,
+        data: ModalityData[VideoItem],
+    ) -> ModalityDataItems[Any, Any]:
+        if self._is_embeddings(data):
+            return VideoEmbeddingItems(data)
+
+        if (is_list_of(data, Image)
+                or isinstance(data,
+                              (np.ndarray, torch.Tensor)) and data.ndim == 4):
+            data_items = [data]
+        elif isinstance(data, (np.ndarray, torch.Tensor)):
+            data_items = [elem for elem in data]
+        else:
+            data_items = data
+
+        return VideoProcessorItems(data_items)
+
+    def _get_subparsers(self) -> Mapping[str, ModalityDataParser]:
+        return {
+            "audio": self._parse_audio_data,
+            "image": self._parse_image_data,
+            "video": self._parse_video_data,
+        }
+
+    def parse_mm_data(self,
+                      mm_data: MultiModalDataDict) -> MultiModalDataItems:
+        subparsers = self._get_subparsers()
+
+        mm_items = MultiModalDataItems()
+        for k, v in mm_data.items():
+            if k not in subparsers:
+                raise ValueError(f"Unsupported modality: {k}")
+
+            mm_items[k] = subparsers[k](v)
+
+        return mm_items
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 3ece0762e3228..180489166b407 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -15,11 +15,12 @@ from transformers import BatchFeature, ProcessorMixin
 from vllm.inputs import DummyData, InputProcessingContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import LRUCache, flatten_2d_lists, full_groupby, is_list_of
+from vllm.utils import LRUCache, flatten_2d_lists, full_groupby
 
-from .inputs import (MultiModalDataDict, MultiModalDataItems,
-                     MultiModalFieldConfig, MultiModalFieldItem,
-                     MultiModalInputsV2, MultiModalKwargs, PlaceholderRange)
+from .inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                     MultiModalFieldItem, MultiModalInputsV2, MultiModalKwargs,
+                     PlaceholderRange)
+from .parse import MultiModalDataItems, MultiModalDataParser
 
 logger = init_logger(__name__)
 
@@ -621,6 +622,16 @@ class BaseMultiModalProcessor(ABC):
     ) -> MultiModalInputsV2:
         return self.apply(prompt, mm_data, hf_processor_mm_kwargs)
 
+    def _get_data_parser(self) -> MultiModalDataParser:
+        """
+        Construct a data parser to preprocess multi-modal data items
+        before passing them to :meth:`_get_hf_mm_data`.
+
+        You can support additional modalities by creating a subclass
+        of :class:`MultiModalDataParser` that has additional subparsers.
+        """
+        return MultiModalDataParser()
+
     def _get_hf_processor(self) -> ProcessorMixin:
         """
         Subclasses can add keyword arguments to this method to accept
@@ -631,11 +642,16 @@ class BaseMultiModalProcessor(ABC):
     def _get_tokenizer(self) -> AnyTokenizer:
         return self.ctx.tokenizer
 
-    def _get_mm_items(
+    def _to_mm_items(
         self,
         mm_data: MultiModalDataDict,
     ) -> MultiModalDataItems:
-        return MultiModalDataItems.from_dict(mm_data)
+        """
+        Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`
+        before passing them to :meth:`_get_hf_mm_data`.
+        """
+        parser = self._get_data_parser()
+        return parser.parse_mm_data(mm_data)
 
     @abstractmethod
     def _get_mm_fields_config(
@@ -680,22 +696,9 @@ class BaseMultiModalProcessor(ABC):
         processor_data = dict[str, Any]()
         passthrough_data = dict[str, Any]()
 
-        for k, v in mm_items.items():
-            # TODO: Make a separate modality for embedding inputs
-            # to avoid confusion
-            if k in ("image", "video", "audio"):
-                if isinstance(v, torch.Tensor) and v.ndim == 3:
-                    # Pass through embedding inputs (single)
-                    passthrough_data[f"{k}_embeds"] = [v]
-                elif (is_list_of(v, torch.Tensor) and len(v) > 0
-                      and v[0].ndim == 2):
-                    # Pass through embedding inputs (multi)
-                    passthrough_data[f"{k}_embeds"] = v
-                elif len(v) > 0:
-                    # Map keys to plural form, e.g.: image -> images
-                    processor_data[f"{k}s"] = v
-            else:
-                processor_data[k] = v
+        for items in mm_items.values():
+            processor_data.update(items.get_processor_data())
+            passthrough_data.update(items.get_passthrough_data())
 
         return processor_data, passthrough_data
 
@@ -756,7 +759,7 @@ class BaseMultiModalProcessor(ABC):
         cached items; instead, we rely on our own prompt replacement logic
         for the full text.
         """
-        mm_missing_counts = mm_missing_data_items.get_item_counts()
+        mm_missing_counts = mm_missing_data_items.get_all_counts()
 
         prompt_ids, _ = self._apply_hf_processor(
             prompt_text=prompt_text,
@@ -789,7 +792,8 @@ class BaseMultiModalProcessor(ABC):
         cache = self.cache
         model_id = self.ctx.model_config.model
 
-        if cache is None or mm_data_items.has_embedding_inputs():
+        _, passthrough_data = self._get_hf_mm_data(mm_data_items)
+        if cache is None or passthrough_data:
             return self._apply_hf_processor(
                 prompt_text=prompt_text,
                 mm_items=mm_data_items,
@@ -812,7 +816,7 @@ class BaseMultiModalProcessor(ABC):
             modality: [mm_data_items[modality][idx] for idx in idxs]
             for modality, idxs in mm_missing_idxs.items()
         }
-        mm_missing_data_items = self._get_mm_items(mm_missing_data)
+        mm_missing_data_items = self._to_mm_items(mm_missing_data)
 
         prompt_ids, mm_missing_kwargs = self._apply_hf_processor_missing(
             prompt_text=prompt_text,
@@ -852,7 +856,7 @@ class BaseMultiModalProcessor(ABC):
             mm_merged_field_items[modality] = merged_modal_items_lst
 
         if self.enable_sanity_checks:
-            mm_missing_counts = mm_missing_data_items.get_item_counts()
+            mm_missing_counts = mm_missing_data_items.get_all_counts()
             assert all(
                 item_count == mm_missing_counts[modality]
                 for modality, item_count in mm_missing_next_idx.items()), dict(
@@ -865,7 +869,7 @@ class BaseMultiModalProcessor(ABC):
         )
 
         if self.enable_sanity_checks:
-            mm_item_counts = mm_data_items.get_item_counts()
+            mm_item_counts = mm_data_items.get_all_counts()
 
             for modality, item_count in mm_item_counts.items():
                 for item_idx in range(item_count):
@@ -958,7 +962,7 @@ class BaseMultiModalProcessor(ABC):
         3. Extract information about the placeholder tokens from the
            processed token IDs.
         """
-        mm_items = self._get_mm_items(mm_data)
+        mm_items = self._to_mm_items(mm_data)
 
         prompt_ids, mm_kwargs = self._cached_apply_hf_processor(
             prompt_text,
@@ -975,7 +979,7 @@ class BaseMultiModalProcessor(ABC):
 
         # If HF processor already inserts placeholder tokens,
         # there is no need for us to insert them
-        mm_item_counts = mm_items.get_item_counts()
+        mm_item_counts = mm_items.get_all_counts()
         all_placeholders = self._find_placeholders(prompt_repls, prompt_ids,
                                                    mm_item_counts)
 
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index b7d43c830cc46..1ad1f5abc27a2 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -15,7 +15,7 @@ from vllm.transformers_utils.processor import get_video_processor
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.utils import PlaceholderModule, is_list_of
 
-from .base import MediaIO, MultiModalData
+from .base import MediaIO, ModalityData
 from .image import ImageMediaIO, ImagePlugin
 from .inputs import MultiModalKwargs, VideoItem
 
@@ -54,7 +54,7 @@ class VideoPlugin(ImagePlugin):
     def _default_input_mapper(
         self,
         ctx: InputContext,
-        data: MultiModalData[VideoItem],
+        data: ModalityData[VideoItem],
         **mm_processor_kwargs,
     ) -> MultiModalKwargs:
         model_config = ctx.model_config

From 5886aa496e8fa31c9180bcfc8e89faaa8899907d Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Mon, 30 Dec 2024 10:51:02 -0500
Subject: [PATCH 34/48] [V1] [6/N] API Server: Better Shutdown (#11586)

---
 vllm/entrypoints/openai/api_server.py | 44 ++++++++-------------------
 vllm/v1/engine/async_llm.py           | 25 +++++++++++++--
 vllm/v1/engine/core_client.py         | 16 ++++------
 3 files changed, 40 insertions(+), 45 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 094cc15a317e9..bac72d87376da 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -68,7 +68,7 @@ from vllm.entrypoints.utils import with_cancellation
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path,
-                        is_valid_ipv6_address, kill_process_tree, set_ulimit)
+                        is_valid_ipv6_address, set_ulimit)
 from vllm.version import __version__ as VLLM_VERSION
 
 TIMEOUT_KEEP_ALIVE = 5  # seconds
@@ -133,32 +133,21 @@ async def build_async_engine_client_from_engine_args(
     Returns the Client or None if the creation failed.
     """
 
-    # Fall back
-    # TODO: fill out feature matrix.
+    # AsyncLLMEngine.
     if (MQLLMEngineClient.is_unsupported_config(engine_args)
             or envs.VLLM_USE_V1 or disable_frontend_multiprocessing):
-        engine_config = engine_args.create_engine_config(
-            UsageContext.OPENAI_API_SERVER)
-        uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config),
-                           "uses_ray", False)
 
-        build_engine = partial(AsyncLLMEngine.from_engine_args,
-                               engine_args=engine_args,
-                               engine_config=engine_config,
-                               usage_context=UsageContext.OPENAI_API_SERVER)
-        if uses_ray:
-            # Must run in main thread with ray for its signal handlers to work
-            engine_client = build_engine()
-        else:
-            engine_client = await asyncio.get_running_loop().run_in_executor(
-                None, build_engine)
+        engine_client: Optional[EngineClient] = None
+        try:
+            engine_client = AsyncLLMEngine.from_engine_args(
+                engine_args=engine_args,
+                usage_context=UsageContext.OPENAI_API_SERVER)
+            yield engine_client
+        finally:
+            if engine_client and hasattr(engine_client, "shutdown"):
+                engine_client.shutdown()
 
-        yield engine_client
-        if hasattr(engine_client, "shutdown"):
-            engine_client.shutdown()
-        return
-
-    # Otherwise, use the multiprocessing AsyncLLMEngine.
+    # MQLLMEngine.
     else:
         if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
             # Make TemporaryDirectory for prometheus multiprocessing
@@ -737,15 +726,6 @@ async def run_server(args, **uvicorn_kwargs) -> None:
 
     signal.signal(signal.SIGTERM, signal_handler)
 
-    # The child processes will send SIGQUIT to this process when
-    # any error happens. This process then clean up the whole tree.
-    # TODO(rob): move this into AsyncLLM.__init__ once we remove
-    # the context manager below.
-    def sigquit_handler(signum, frame):
-        kill_process_tree(os.getpid())
-
-    signal.signal(signal.SIGQUIT, sigquit_handler)
-
     async with build_async_engine_client(args) as engine_client:
         app = build_app(args)
 
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 213ddaa023dbc..3f097ca7f439c 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -1,4 +1,6 @@
 import asyncio
+import os
+import signal
 from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union
 
 from vllm.config import ModelConfig, VllmConfig
@@ -16,6 +18,7 @@ from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
+from vllm.utils import kill_process_tree
 from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.detokenizer import Detokenizer
 from vllm.v1.engine.processor import Processor
@@ -38,6 +41,22 @@ class AsyncLLM(EngineClient):
         log_requests: bool = True,
         start_engine_loop: bool = True,
     ) -> None:
+
+        # The child processes will send SIGQUIT when unrecoverable
+        # errors happen. We kill the process tree here so that the
+        # stack trace is very evident.
+        # TODO: rather than killing the main process, we should
+        # figure out how to raise an AsyncEngineDeadError and
+        # handle at the API server level so we can return a better
+        # error code to the clients calling VLLM.
+        def sigquit_handler(signum, frame):
+            logger.fatal(
+                "AsyncLLM got SIGQUIT from worker processes, shutting "
+                "down. See stack trace above for root cause issue.")
+            kill_process_tree(os.getpid())
+
+        signal.signal(signal.SIGQUIT, sigquit_handler)
+
         assert start_engine_loop
 
         self.log_requests = log_requests
@@ -276,9 +295,9 @@ class AsyncLLM(EngineClient):
                 # 4) Abort any requests that finished due to stop strings.
                 await self.engine_core.abort_requests_async(reqs_to_abort)
 
-        except BaseException as e:
-            logger.error(e)
-            raise e
+        except Exception as e:
+            logger.exception("EngineCore output handler hit an error: %s", e)
+            kill_process_tree(os.getpid())
 
     async def abort(self, request_id: str) -> None:
         """Abort RequestId in self, detokenizer, and engine core."""
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index beb5d57c20c83..3293205e110af 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -6,7 +6,7 @@ import zmq.asyncio
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import get_open_zmq_ipc_path
+from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket
 from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
                             EngineCoreProfile, EngineCoreRequest,
                             EngineCoreRequestType, EngineCoreRequestUnion)
@@ -144,17 +144,13 @@ class MPClient(EngineCoreClient):
         else:
             self.ctx = zmq.Context()  # type: ignore[attr-defined]
 
-        # Path for IPC.
+        # Paths and sockets for IPC.
         output_path = get_open_zmq_ipc_path()
         input_path = get_open_zmq_ipc_path()
-
-        # Get output (EngineCoreOutput) from EngineCore.
-        self.output_socket = self.ctx.socket(zmq.constants.PULL)
-        self.output_socket.connect(output_path)
-
-        # Send input (EngineCoreRequest) to EngineCore.
-        self.input_socket = self.ctx.socket(zmq.constants.PUSH)
-        self.input_socket.bind(input_path)
+        self.output_socket = make_zmq_socket(self.ctx, output_path,
+                                             zmq.constants.PULL)
+        self.input_socket = make_zmq_socket(self.ctx, input_path,
+                                            zmq.constants.PUSH)
 
         # Start EngineCore in background process.
         self.proc_handle: Optional[BackgroundProcHandle]

From 36e76700453924c8d421db99af70a88a1df835cd Mon Sep 17 00:00:00 2001
From: whyiug <whyiug@hotmail.com>
Date: Tue, 31 Dec 2024 02:51:04 +0800
Subject: [PATCH 35/48] [Bugfix] Validate and concatenate image embeddings in
 MiniCPMVBaseModel (#11631)

---
 vllm/model_executor/models/minicpmv.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 1e8f9bd4cf418..712022502539b 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -487,6 +487,12 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
         image_embeds = kwargs.pop("image_embeds", None)
 
         if image_embeds is not None:
+            if not isinstance(image_embeds, (torch.Tensor, list)):
+                raise ValueError(f"Incorrect type of image embeds. "
+                                 f"Got type: {type(image_embeds)}")
+            if isinstance(image_embeds, list):
+                image_embeds = torch.concat(image_embeds)
+
             return MiniCPMVImageEmbeddingInputs(
                 image_bounds=self._get_image_bounds(input_ids, im_start_id,
                                                     im_end_id, slice_start_id,

From ccb1aabccaa7aaf07b08fd8be30380e828efba0f Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Mon, 30 Dec 2024 12:27:07 -0800
Subject: [PATCH 36/48] [benchmark] Remove dependency for H100 benchmark step
 (#11572)

---
 .buildkite/nightly-benchmarks/benchmark-pipeline.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
index 708e548727cf5..868b8e95db01d 100644
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -73,7 +73,7 @@ steps:
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: H100
-    depends_on: block-h100
+    depends_on: ~
     plugins:
     - docker#v5.12.0:
         image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT

From a2a40bcd0d8275e19c46e9cc06ee994d8839b98d Mon Sep 17 00:00:00 2001
From: Matthias Vogler <60004995+ayylemao@users.noreply.github.com>
Date: Tue, 31 Dec 2024 02:33:06 +0100
Subject: [PATCH 37/48] [Model][LoRA]LoRA support added for MolmoForCausalLM
 (#11439)

Signed-off-by: Matthias Vogler <matthias.vogler@joesecurity.org>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Matthias Vogler <matthias.vogler@joesecurity.org>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
---
 docs/source/models/supported_models.md |  2 +-
 vllm/model_executor/models/molmo.py    | 45 ++++++++++++++++++++++++--
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 518505abeb2a9..613343281464c 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -666,7 +666,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - Molmo
   - T + I
   - `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc.
-  -
+  - ✅︎
   - ✅︎
   - ✅︎
 * - `NVLM_D_Model`
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 5d52d2c3e6b48..cc25be9f5b6a9 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -36,6 +36,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.inputs import NestedTensors, PlaceholderRange
 from vllm.multimodal.utils import cached_get_tokenizer
@@ -43,7 +44,7 @@ from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
                            SequenceData)
 from vllm.transformers_utils.processor import get_processor
 
-from .interfaces import SupportsMultiModal, SupportsPP
+from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix, merge_multimodal_embeddings)
@@ -1161,8 +1162,8 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs):
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_molmo_image_tokens)
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_molmo)
 @INPUT_REGISTRY.register_input_processor(input_processor_for_molmo)
-class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-
+class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
+                       SupportsLoRA):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             # vision backbone mapping
@@ -1191,6 +1192,32 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         },
     )
 
+    packed_modules_mapping = {
+        "qkv_proj": ["qkv_proj"],
+        "gate_up_proj": ["gate_up_proj"],  # language model
+        "merged_linear": ["gate_proj", "up_proj"]  # image_projector
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        # language model
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",  # same name with image_projector
+        # vision tower
+        "wq",
+        "wk",
+        "wv",
+        "wo",
+        "w1",
+        "w2",
+        # image_projector
+        "merged_linear",
+    ]
+    embedding_modules = {}
+    embedding_padding_modules = []
+
     # BitandBytes specific attributes
     bitsandbytes_stacked_params_mapping = {
         "gate_proj": ("merged_linear", 0),
@@ -1202,8 +1229,10 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
+        lora_config = vllm_config.lora_config
         self.config = config
         self.multimodal_config = multimodal_config
+        self.lora_config = lora_config
 
         vision_config = VisionBackboneConfig()
         self.vision_backbone = MolmoVisionBackbone(config, vision_config,
@@ -1377,6 +1406,16 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         weights = _get_weights_with_merged_embedding(weights)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="model",
+            connector="vision_backbone.image_projector",
+            tower_model="vision_backbone",
+        )
+
 
 def _get_weights_with_merged_embedding(
     weights: Iterable[Tuple[str, torch.Tensor]]

From 74fa1d123c2818065d862d2ceb2338468914fa79 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Mon, 30 Dec 2024 22:43:54 -0500
Subject: [PATCH 38/48] [Bugfix] Fix OpenAI parallel sampling when using
 xgrammar (#11637)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 tests/entrypoints/openai/test_completion.py        | 14 ++++++--------
 .../guided_decoding/xgrammar_decoding.py           |  5 +++++
 vllm/sampling_params.py                            |  9 +++++----
 vllm/sequence.py                                   |  2 +-
 4 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index c81cfdbbe5cff..183d900c493e5 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -28,6 +28,8 @@ PA_NAME = "swapnilbp/llama_tweet_ptune"
 # need to change to match the prompt adapter
 PA_NUM_VIRTUAL_TOKENS = 8
 
+GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
+
 
 @pytest.fixture(scope="module")
 def zephyr_lora_files():
@@ -635,8 +637,7 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_json_completion(client: openai.AsyncOpenAI,
                                       guided_decoding_backend: str,
                                       sample_json_schema):
@@ -658,8 +659,7 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_regex_completion(client: openai.AsyncOpenAI,
                                        guided_decoding_backend: str,
                                        sample_regex):
@@ -680,8 +680,7 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_completion(client: openai.AsyncOpenAI,
                                         guided_decoding_backend: str,
                                         sample_guided_choice):
@@ -761,8 +760,7 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
                                           guided_decoding_backend: str,
                                           sample_json_schema, sample_regex):
diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index 5e1948977bff4..f10a8fb8e03cf 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -1,6 +1,7 @@
 # noqa: UP007
 from __future__ import annotations
 
+import copy
 import json
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any
@@ -309,3 +310,7 @@ class XGrammarLogitsProcessor:
             scores = scores.to(device_type).squeeze()
 
         return scores
+
+    def clone(self) -> XGrammarLogitsProcessor:
+        """Deepcopy due to per-sequence state in the matchers"""
+        return copy.deepcopy(self)
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index fc77f3ca529b2..605c09b8d7225 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -450,15 +450,16 @@ class SamplingParams(
         return self._all_stop_token_ids
 
     def clone(self) -> "SamplingParams":
-        """Deep copy excluding LogitsProcessor objects.
+        """Deep copy, but maybe not the LogitsProcessor objects.
 
-        LogitsProcessor objects are excluded because they may contain an
-        arbitrary, nontrivial amount of data.
+        LogitsProcessor objects may contain an arbitrary, nontrivial amount of
+        data that is expensive to copy. However, if not copied, the processor
+        needs to support parallel decoding for multiple sequences
         See https://github.com/vllm-project/vllm/issues/3087
         """
 
         logit_processor_refs = None if self.logits_processors is None else {
-            id(lp): lp
+            id(lp): lp.clone() if hasattr(lp, 'clone') else lp
             for lp in self.logits_processors
         }
         return copy.deepcopy(self, memo=logit_processor_refs)
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 34f910d47b7d9..034f89c0ddbe9 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1372,7 +1372,7 @@ class ParallelSampleSequenceGroup(SequenceGroupBase):
     @staticmethod
     def add_request(request_id: str, engine, params, **kwargs):
         original_params = params
-        params = copy.deepcopy(original_params)
+        params = original_params.clone()
         params.n = 1
         group = ParallelSampleSequenceGroup(request_id)
         seqs = []

From 82c49d3260f1fb9fcd686736e8439dc69cd2f1c4 Mon Sep 17 00:00:00 2001
From: John Giorgi <johnmgiorgi@gmail.com>
Date: Tue, 31 Dec 2024 01:15:58 -0500
Subject: [PATCH 39/48] [Misc][LoRA] Support Rank Stabilized LoRA (RSLoRA)
 (#6909)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/lora/test_lora_manager.py | 20 +++++++++++++-------
 vllm/lora/lora.py               | 12 +++---------
 vllm/lora/models.py             |  2 +-
 vllm/lora/peft_helper.py        | 18 +++++++++++++-----
 4 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 0b76f466702fc..a099f36b0a465 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -1,4 +1,5 @@
 import json
+import math
 import os
 from typing import Dict, List
 
@@ -50,6 +51,18 @@ def test_peft_helper(sql_lora_files):
         "embed_tokens",
         "lm_head",
     ]
+    scaling = peft_helper.lora_alpha / peft_helper.r
+    assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
+
+    # test RSLoRA
+    config = dict(r=8,
+                  lora_alpha=16,
+                  target_modules=["gate_proj"],
+                  use_rslora=True)
+    peft_helper = PEFTHelper.from_dict(config)
+
+    scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r)
+    assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
 
     expected_error = "vLLM only supports modules_to_save being None."
     with pytest.raises(ValueError, match=expected_error):
@@ -60,13 +73,6 @@ def test_peft_helper(sql_lora_files):
             modules_to_save=["lm_head"],
         )
         PEFTHelper.from_dict(config)
-    expected_error = "vLLM does not yet support RSLoRA."
-    with pytest.raises(ValueError, match=expected_error):
-        config = dict(r=8,
-                      lora_alpha=16,
-                      target_modules=["gate_proj"],
-                      use_rslora=True)
-        PEFTHelper.from_dict(config)
 
     expected_error = "vLLM does not yet support DoRA."
     with pytest.raises(ValueError, match=expected_error):
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py
index dde347b78bf81..93ad4651f4b77 100644
--- a/vllm/lora/lora.py
+++ b/vllm/lora/lora.py
@@ -67,15 +67,9 @@ class LoRALayerWeights:
         peft_helper: PEFTHelper,
         embeddings_tensor: Optional[torch.Tensor] = None,
     ) -> "LoRALayerWeights":
-        return cls(
-            module_name,
-            peft_helper.r,
-            peft_helper.lora_alpha,
-            None,
-            None,
-            None,
-            embeddings_tensor,
-        )
+        return cls(module_name, peft_helper.r, peft_helper.lora_alpha, None,
+                   None, None, embeddings_tensor,
+                   peft_helper.vllm_lora_scaling_factor)
 
     @classmethod
     def create_dummy_lora_weights(
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 5c0e4e5cbc636..9cfcc6bba727f 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -173,7 +173,7 @@ class LoRAModel(AdapterModel):
         return cls(lora_model_id,
                    peft_helper.r,
                    loras,
-                   scaling_factor=peft_helper.vllm_scaling_factor)
+                   scaling_factor=peft_helper.vllm_long_context_scaling_factor)
 
     @classmethod
     def from_local_checkpoint(
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index edf4ba5659575..ddd42ae93d290 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -4,6 +4,8 @@ import math
 from dataclasses import MISSING, dataclass, field, fields
 from typing import Literal, Optional, Union
 
+from vllm.utils import print_info_once
+
 
 @dataclass
 class PEFTHelper:
@@ -14,21 +16,22 @@ class PEFTHelper:
 
     bias: Literal["none", "all", "lora_only"] = field(default="none")
     modules_to_save: Optional[list[str]] = field(default=None)
+    # True to use Rank-Stabilized LoRA (rsLoRA, see: https://arxiv.org/abs/2312.03732)
     use_rslora: bool = field(default=False)
+    # True to use Weight-Decomposed Low-Rank Adaptation (DoRA, see: https://arxiv.org/abs/2402.09353)
     use_dora: bool = field(default=False)
-    # long lora field
+    # long context lora field
     context_length: int = field(default=0)
     # Extra vllm field, start with 'vllm_' to avoid conflict
+    vllm_lora_scaling_factor: float = field(default=1.0)
     vllm_max_position_embeddings: Optional[int] = field(default=False)
-    vllm_scaling_factor: Optional[float] = field(default=None)
+    vllm_long_context_scaling_factor: Optional[float] = field(default=None)
 
     def _validate_features(self):
         error_msg = []
 
         if self.modules_to_save:
             error_msg.append("vLLM only supports modules_to_save being None.")
-        if self.use_rslora:
-            error_msg.append("vLLM does not yet support RSLoRA.")
 
         if self.use_dora:
             error_msg.append("vLLM does not yet support DoRA.")
@@ -38,10 +41,15 @@ class PEFTHelper:
 
     def __post_init__(self):
         self._validate_features()
+        if self.use_rslora:
+            print_info_once("Loading LoRA weights trained with rsLoRA.")
+            self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r)
+        else:
+            self.vllm_lora_scaling_factor = self.lora_alpha / self.r
         if self.context_length:
             if self.vllm_max_position_embeddings is None:
                 self.vllm_max_position_embeddings = self.context_length
-            self.vllm_scaling_factor = float(
+            self.vllm_long_context_scaling_factor = float(
                 math.ceil(self.context_length /
                           self.vllm_max_position_embeddings))
 

From 2c5718809bb5f4bce2ae8e05041d613215dac1aa Mon Sep 17 00:00:00 2001
From: sakunkun <zhou.qianjun@zte.com.cn>
Date: Tue, 31 Dec 2024 14:29:04 +0800
Subject: [PATCH 40/48] [Bugfix] Move the _touch(computed_blocks) call in the
 allocate_slots method to after the check for allocating new blocks. (#11565)

---
 tests/v1/core/test_prefix_caching.py | 63 +++++++++++++++++++++++++++-
 vllm/v1/core/kv_cache_manager.py     | 19 ++++++---
 2 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index ed04f0a373c51..dafaa6aee9995 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -98,9 +98,9 @@ def test_prefill():
     # Incomplete 1 block (6 tokens)
     unique_token_ids = [3] * 6
     req2 = make_request("2", common_token_ids + unique_token_ids)
-    computed_block = manager.get_computed_blocks(req2)
+    computed_blocks = manager.get_computed_blocks(req2)
     assert len(req2.kv_block_hashes) == 3
-    assert [b.block_id for b in computed_block] == [0, 1, 2]
+    assert [b.block_id for b in computed_blocks] == [0, 1, 2]
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks)
     assert [b.block_id for b in blocks] == [7, 8]
@@ -500,3 +500,62 @@ def test_mm_prefix_caching():
                         mm_hashes=mm_hashes)
     computed_blocks = manager.get_computed_blocks(req1)
     assert len(computed_blocks) == 3
+
+
+def test_prefill_not_enough_free_blocks_with_computed_blocks():
+    """
+    This is a unit test that tests the correctness of the allocate_slots
+    when there is not enough free blocks. Specifically, when a request
+    has computed blocks but cannot be allocated due to not enough free blocks,
+    the computed blocks should not be touched.
+    """
+    block_size = 16
+    manager = KVCacheManager(
+        block_size=block_size,
+        num_gpu_blocks=10,
+        max_model_len=8192,
+        sliding_window=None,
+        enable_caching=True,
+        num_preallocate_tokens=0,
+    )
+    # Complete 3 blocks (48 tokens)
+    # | Common-0 | Common-1 | Common-2 | ... |
+    common_token_ids = [i for i in range(3) for _ in range(16)]
+    req0 = make_request("0", common_token_ids)
+    computed_blocks = manager.get_computed_blocks(req0)
+    assert not computed_blocks
+    manager.allocate_slots(req0, 48, computed_blocks)
+    block_part0 = manager.req_to_blocks[req0.request_id]
+
+    # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
+    req1 = make_request("1", common_token_ids * 2)
+    computed_blocks = manager.get_computed_blocks(req1)
+    assert computed_blocks == block_part0
+    manager.allocate_slots(req1, 48, computed_blocks)
+    block_part1 = manager.req_to_blocks[req1.request_id]
+    # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
+    # | Req1-5(F)| ... |
+    manager.free(req1)
+    assert {block.ref_cnt for block in block_part1[:3]} == {1}
+    assert {block.ref_cnt for block in block_part1[3:]} == {0}
+
+    # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
+    # | Req1-5(F)| Req2-0   | Req2-1   | ... |
+    req2 = make_request("2", [7] * block_size * 2)
+    computed_blocks = manager.get_computed_blocks(req2)
+    assert not computed_blocks
+    manager.allocate_slots(req2, block_size * 2, computed_blocks)
+
+    # Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed,
+    # but it cannot be allocated due to insufficient free blocks (2).
+    # In this case, the ref_cnt of the computed blocks should not be changed.
+    assert manager.free_block_queue.num_free_blocks == 5
+    req3 = make_request("3", common_token_ids * 3)
+    computed_blocks = manager.get_computed_blocks(req3)
+    assert computed_blocks == block_part1
+    # Req3 cannot be allocated.
+    assert manager.allocate_slots(req3, 48, computed_blocks) is None
+    # Block 0-2 are used by Req 1.
+    assert {block.ref_cnt for block in block_part1[:3]} == {1}
+    # Block 3-5 are free.
+    assert {block.ref_cnt for block in block_part1[3:]} == {0}
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 78efacccfa078..00d0de51634ae 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -191,7 +191,7 @@ class KVCacheManager:
             request: The request to allocate slots.
             num_tokens: The number of tokens to allocate. Note that this does
                 not include the tokens that have already been computed.
-            computed_blocks: The blocks that have already been computed.
+            computed_blocks: A list of computed blocks.
 
         Returns:
             A list of new allocated blocks.
@@ -200,6 +200,18 @@ class KVCacheManager:
             raise ValueError(
                 f"num_tokens must be greater than 0, got {num_tokens}")
 
+        # If a computed block of a request is an eviction candidate (in the
+        # free queue and ref_cnt == 0), it cannot be counted as a free block
+        # when allocating this request.
+        num_evictable_computed_blocks = sum(1 for blk in computed_blocks
+                                            if blk.ref_cnt == 0)
+
+        num_required_blocks = cdiv(num_tokens, self.block_size)
+        if (num_required_blocks > self.free_block_queue.num_free_blocks -
+                num_evictable_computed_blocks):
+            # Cannot allocate new blocks.
+            return None
+
         # Touch the computed blocks to make sure they won't be evicted.
         if self.enable_caching:
             self._touch(computed_blocks)
@@ -208,11 +220,6 @@ class KVCacheManager:
                 "Computed blocks should be empty when "
                 "prefix caching is disabled")
 
-        num_required_blocks = cdiv(num_tokens, self.block_size)
-        if (num_required_blocks > self.free_block_queue.num_free_blocks):
-            # Cannot allocate new blocks.
-            return None
-
         # Determine the number of new blocks to allocate considering
         # preallocated blocks.
         num_new_blocks = min(

From 8c3230d8c1cf114618c2316c54bf06b7d0c198b6 Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Tue, 31 Dec 2024 16:56:01 +0800
Subject: [PATCH 41/48] [V1] Simpify vision block hash for prefix caching by
 removing offset from hash (#11646)

---
 tests/v1/core/test_prefix_caching.py | 8 ++++----
 vllm/v1/core/kv_cache_utils.py       | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index dafaa6aee9995..35e3a2f972720 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -469,9 +469,9 @@ def test_mm_prefix_caching():
     # Completed block should have hashes with extra keys.
     assert not computed_blocks
     assert len(req0.kv_block_hashes) == 3
-    assert req0.kv_block_hashes[0].extra_keys == (("aaa", 0), )
-    assert req0.kv_block_hashes[1].extra_keys == (("aaa", 5), ("bbb", 0))
-    assert req0.kv_block_hashes[2].extra_keys == (("bbb", 2), )
+    assert req0.kv_block_hashes[0].extra_keys == ("aaa", )
+    assert req0.kv_block_hashes[1].extra_keys == ("aaa", "bbb")
+    assert req0.kv_block_hashes[2].extra_keys == ("bbb", )
 
     blocks = manager.allocate_slots(req0, 59, computed_blocks)
     assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
@@ -485,7 +485,7 @@ def test_mm_prefix_caching():
 
     # The just completed block should have hashes with extra keys.
     assert len(req0.kv_block_hashes) == 4
-    assert req0.kv_block_hashes[3].extra_keys == (("ccc", 0), )
+    assert req0.kv_block_hashes[3].extra_keys == ("ccc", )
 
     # Cache hit.
     unique_token_ids = [-1] * 7 + [200] * 5
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 9ddbff7c9a604..84ff48bf428a0 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -218,8 +218,8 @@ def generate_block_hash_extra_keys(
                 continue
 
             # The block contains the current mm input.
-            mm_start = max(0, start_token_idx - offset)
-            extra_keys.append((mm_hashes[curr_mm_idx], mm_start))
+            extra_keys.append(mm_hashes[curr_mm_idx])
+
             if end_token_idx >= offset + length:
                 # If this block contains the end of the current mm input,
                 # move to the next mm input as this block may also contain

From e7c7c5e822a886e3dba202ca1b756c3260efffcc Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Tue, 31 Dec 2024 13:17:22 -0800
Subject: [PATCH 42/48] [V1][VLM] V1 support for selected single-image models.
 (#11632)

Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Isotr0py <2037008807@qq.com>
---
 docs/source/models/supported_models.md        |  10 +-
 examples/offline_inference_vision_language.py |  10 +-
 .../vision_language/test_models.py            |   7 +-
 tests/multimodal/test_processing.py           |  29 +-
 vllm/model_executor/models/aria.py            | 173 ++++-----
 vllm/model_executor/models/blip.py            |  92 -----
 vllm/model_executor/models/blip2.py           | 176 +++++----
 vllm/model_executor/models/chameleon.py       | 173 ++++-----
 vllm/model_executor/models/fuyu.py            | 361 +++++++++---------
 .../models/idefics2_vision_model.py           |   6 +-
 vllm/model_executor/models/llava.py           |   4 +-
 vllm/model_executor/models/llava_next.py      |   6 +-
 vllm/model_executor/models/pixtral.py         |  12 +-
 vllm/model_executor/models/qwen2_audio.py     |  14 +-
 vllm/model_executor/models/qwen2_vl.py        |  17 +-
 vllm/model_executor/models/ultravox.py        |  13 +-
 vllm/multimodal/processing.py                 |  68 +++-
 vllm/multimodal/utils.py                      |  10 +-
 vllm/v1/worker/gpu_model_runner.py            |  15 +-
 19 files changed, 575 insertions(+), 621 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 613343281464c..f74c201bdff6b 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -570,28 +570,28 @@ See [this page](#generative-models) for more information on how to use generativ
   - `rhymes-ai/Aria`
   -
   - ✅︎
-  -
+  - ✅︎
 * - `Blip2ForConditionalGeneration`
   - BLIP-2
   - T + I<sup>E</sup>
   - `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc.
   -
   - ✅︎
-  -
+  - ✅︎
 * - `ChameleonForConditionalGeneration`
   - Chameleon
   - T + I
   - `facebook/chameleon-7b` etc.
   -
   - ✅︎
-  -
+  - ✅︎
 * - `FuyuForCausalLM`
   - Fuyu
   - T + I
   - `adept/fuyu-8b` etc.
   -
   - ✅︎
-  -
+  - ✅︎
 * - `ChatGLMModel`
   - GLM-4V
   - T + I
@@ -633,7 +633,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
   -
   - ✅︎
-  -
+  - ✅︎
 * - `LlavaNextVideoForConditionalGeneration`
   - LLaVA-NeXT-Video
   - T + V
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index 77af914a6ef02..b51bfae455267 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -24,10 +24,13 @@ def run_aria(question: str, modality: str):
     assert modality == "image"
     model_name = "rhymes-ai/Aria"
 
+    # NOTE: Need L40 (or equivalent) to avoid OOM
     llm = LLM(model=model_name,
               tokenizer_mode="slow",
-              trust_remote_code=True,
               dtype="bfloat16",
+              max_model_len=4096,
+              max_num_seqs=2,
+              trust_remote_code=True,
               disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
 
     prompt = (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>\n{question}"
@@ -57,6 +60,7 @@ def run_chameleon(question: str, modality: str):
     prompt = f"{question}<image>"
     llm = LLM(model="facebook/chameleon-7b",
               max_model_len=4096,
+              max_num_seqs=2,
               disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
@@ -257,7 +261,7 @@ def run_minicpmv(question: str, modality: str):
     # 2.5
     # model_name = "openbmb/MiniCPM-Llama3-V-2_5"
 
-    #2.6
+    # 2.6
     model_name = "openbmb/MiniCPM-V-2_6"
     tokenizer = AutoTokenizer.from_pretrained(model_name,
                                               trust_remote_code=True)
@@ -430,9 +434,11 @@ def run_pixtral_hf(question: str, modality: str):
 
     model_name = "mistral-community/pixtral-12b"
 
+    # NOTE: Need L40 (or equivalent) to avoid OOM
     llm = LLM(
         model=model_name,
         max_model_len=8192,
+        max_num_seqs=2,
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 1a9c1b4ef1be0..7db08166826eb 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -140,10 +140,7 @@ VLM_TEST_SETTINGS = {
     "aria": VLMTestInfo(
         models=["rhymes-ai/Aria"],
         tokenizer_mode="slow",
-        test_type=(
-            VLMTestType.IMAGE,
-            VLMTestType.MULTI_IMAGE,
-        ),
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         dtype="bfloat16",
         prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
         img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
@@ -179,6 +176,7 @@ VLM_TEST_SETTINGS = {
         test_type=VLMTestType.IMAGE,
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
+        max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
@@ -201,7 +199,6 @@ VLM_TEST_SETTINGS = {
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
-        marks=[large_gpu_mark(min_gb=48)],
     ),
     "glm4": VLMTestInfo(
         models=["THUDM/glm-4v-9b"],
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 1b2847ed0f534..81278cde264ff 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -528,7 +528,7 @@ def _rand_audio(
 
 def _test_processing_cache_correctness(
     model_id: str,
-    modalities: set[str],
+    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
@@ -583,9 +583,8 @@ def _test_processing_cache_correctness(
         partial(_rand_audio, rng, min_len=256, max_len=512, sr=16000),
     }
     input_max_count = {
-        "image": 3,
-        "video": 3,
-        "audio": 3,
+        modality: 3 if supports_multi else 1
+        for modality, supports_multi in modalities.items()
     }
 
     for batch_idx in range(num_batches):
@@ -624,12 +623,16 @@ def _test_processing_cache_correctness(
 
 # yapf: disable
 @pytest.mark.parametrize(("model_id", "modalities"), [
-    ("llava-hf/llava-1.5-7b-hf", {"image"}),
-    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image"}),
-    ("mistral-community/pixtral-12b", {"image"}),
-    ("Qwen/Qwen2-VL-2B-Instruct", {"image", "video"}),
-    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio"}),
-    ("fixie-ai/ultravox-v0_3", {"audio"}),
+    ("rhymes-ai/Aria", {"image": True}),
+    ("Salesforce/blip2-opt-2.7b", {"image": False}),
+    ("facebook/chameleon-7b", {"image": True}),
+    ("adept/fuyu-8b", {"image": False}),
+    ("llava-hf/llava-1.5-7b-hf", {"image": True}),
+    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
+    ("mistral-community/pixtral-12b", {"image": True}),
+    ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
+    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}),
+    ("fixie-ai/ultravox-v0_3", {"audio": True}),
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
@@ -637,7 +640,7 @@ def _test_processing_cache_correctness(
 # yapf: enable
 def test_processing_cache_correctness(
     model_id: str,
-    modalities: set[str],
+    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
@@ -653,7 +656,7 @@ def test_processing_cache_correctness(
 
 # yapf: disable
 @pytest.mark.parametrize(("model_id", "modalities"), [
-    ("microsoft/Phi-3-vision-128k-instruct", {"image"}),
+    ("microsoft/Phi-3-vision-128k-instruct", {"image": True}),
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
@@ -661,7 +664,7 @@ def test_processing_cache_correctness(
 # yapf: enable
 def test_processing_cache_correctness_phi3v(
     model_id: str,
-    modalities: set[str],
+    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 9437ad9688422..4ad6e859f4d93 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -1,15 +1,15 @@
-import math
-from typing import Iterable, List, Optional, Set, Tuple, TypedDict, Union
+from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
+                    Union)
 
 import torch
 import torch.nn as nn
 from torch.nn.init import trunc_normal_
-from transformers import LlamaConfig
+from transformers import BatchFeature, PretrainedConfig
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, QuantizationConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_rank
-from vllm.inputs import INPUT_REGISTRY, token_inputs
+from vllm.inputs import InputContext
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -17,30 +17,27 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     get_compressed_tensors_cache_scale)
-from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput,
-                                                SamplingMetadata)
+from vllm.model_executor.layers.sampler import (SamplerOutput,
+                                                SamplingMetadata, get_sampler)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.models.idefics2_vision_model import (
-    Idefics2VisionTransformer)
-from vllm.model_executor.models.interfaces import SupportsMultiModal
-from vllm.model_executor.models.llama import (LlamaDecoderLayer, LlamaMLP,
-                                              LlamaModel)
-from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper,
-                                              is_pp_missing_parameter,
-                                              maybe_prefix,
-                                              merge_multimodal_embeddings)
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import cached_get_image_processor
-from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
-from vllm.multimodal.utils import (cached_get_tokenizer,
-                                   repeat_and_pad_placeholder_tokens)
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.aria import (AriaMoELMConfig,
                                                   AriaVisionConfig)
 
-from .utils import flatten_bn
+from .idefics2_vision_model import Idefics2VisionTransformer
+from .interfaces import SupportsMultiModal
+from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel
+from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
+                    is_pp_missing_parameter, maybe_prefix,
+                    merge_multimodal_embeddings)
 
 
 class AriaImagePixelInputs(TypedDict):
@@ -251,7 +248,7 @@ class AriaProjector(nn.Module):
 class AriaFusedMoE(FusedMoE):
 
     def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
-                      shard_id: str) -> Set[str]:
+                      shard_id: str) -> None:
         # Override the weight_loader to handle the expert weights in the Aria
         # model, which are already packed with experts, and merge the gate and
         # up weights for each expert.
@@ -346,7 +343,7 @@ class MoEDecoderLayer(LlamaDecoderLayer):
 
     def __init__(
         self,
-        config: LlamaConfig,
+        config: AriaMoELMConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -434,7 +431,7 @@ class AriaMoELMModel(LlamaModel):
         return loaded_params
 
 
-def build_mm_projector(config):
+def build_mm_projector(config: PretrainedConfig):
     return AriaProjector(
         patch_to_query_dict=config.projector_patch_to_query_dict,
         embed_dim=config.vision_config.hidden_size,
@@ -445,75 +442,70 @@ def build_mm_projector(config):
     )
 
 
-def get_max_multimodal_tokens(ctx):
-    return max(ctx.model_config.hf_config.image_size2tokens.values())
+def get_max_aria_image_tokens(ctx: InputContext):
+    hf_config = ctx.get_hf_config()
+    return max(hf_config.projector_patch_to_query_dict.values())
 
 
-def input_mapper_for_aria(ctx, data):
-    return MultiModalKwargs(data)
+class AriaMultiModalProcessor(BaseMultiModalProcessor):
 
-
-def input_processor(ctx, llm_inputs):
-    multi_modal_data = llm_inputs.get("multi_modal_data")
-    # if it is pure text input, use it as is
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return llm_inputs
-
-    model_config = ctx.model_config
-
-    tokenizer = cached_get_tokenizer(model_config.tokenizer)
-    image_processor = cached_get_image_processor(
-        model_config.model, trust_remote_code=model_config.trust_remote_code)
-    hf_config = model_config.hf_config
-
-    # prepare image tokens, the max_image_size is used to determine the number
-    # of patch_size for every image
-    max_image_size = multi_modal_data.pop("max_image_size", 980)
-    _split_image = multi_modal_data.pop("split_image", False)
-
-    assert isinstance(max_image_size,
-                      (int, float)), "max_image_size should be float or int"
-    images = (multi_modal_data["image"] if isinstance(
-        multi_modal_data["image"], list) else [multi_modal_data["image"]])
-
-    image_inputs = image_processor.preprocess(images,
-                                              max_image_size=max_image_size,
-                                              split_image=_split_image,
-                                              return_tensors="pt").data
-    image_inputs['pixel_values'] = image_inputs['pixel_values'].to(
-        ctx.model_config.dtype)
-    num_crops = image_inputs.pop("num_crops")
-
-    prompt_token_ids = llm_inputs["prompt_token_ids"]
-    if num_crops.sum().item() > 0:
-        _, prompt_token_ids, _ = repeat_and_pad_placeholder_tokens(
-            tokenizer,
-            None,
-            prompt_token_ids,
-            placeholder_token_id=hf_config.image_token_index,
-            repeat_count=num_crops,
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            pixel_mask=MultiModalFieldConfig.batched("image"),
         )
 
-    repeat_count = [hf_config.image_size2tokens[max_image_size]
-                    ] * sum(num_crops).item()
-    new_prompt, new_token_ids, _ = repeat_and_pad_placeholder_tokens(
-        tokenizer,
-        None,
-        prompt_token_ids,
-        placeholder_token_id=hf_config.image_token_index,
-        repeat_count=repeat_count,
-    )
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        hf_config = self.ctx.get_hf_config()
+        image_token_id = hf_config.image_token_index
 
-    return token_inputs(
-        prompt_token_ids=new_token_ids,
-        prompt=new_prompt,
-        multi_modal_data={"image": image_inputs},
-    )
+        max_image_tokens = get_max_aria_image_tokens(self.ctx)
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id],
+                replacement=[image_token_id] * max_image_tokens,
+            )
+        ]
+
+    def _get_dummy_mm_inputs(
+        self,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        hf_config = self.ctx.get_hf_config()
+        vision_config: AriaVisionConfig = hf_config.vision_config
+
+        max_image_size = vision_config.image_size
+        num_images = mm_counts.get("image", 0)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=max_image_size,
+                                   height=max_image_size,
+                                   num_images=num_images)
+        }
+
+        hf_processor = self._get_hf_processor()
+        image_token: str = hf_processor.image_token  # type: ignore
+
+        return ProcessorInputs(
+            prompt_text=image_token * num_images,
+            mm_data=mm_data,
+        )
 
 
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_multimodal_tokens)
-@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_aria)
-@INPUT_REGISTRY.register_input_processor(input_processor)
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_aria_image_tokens)
+@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor)
 class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     """
     Aria model for conditional generation tasks.
@@ -540,12 +532,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
 
-        # prepare the image_size to tokens mapping for the image preprocess, see
-        # input_processor
-        config.image_size2tokens = {
-            int(math.sqrt(k) * config.vision_config.patch_size): v
-            for k, v in config.projector_patch_to_query_dict.items()
-        }
         self.config = config
         self.vision_tower = AriaVisionModel(config.vision_config)
         self.multi_modal_projector = build_mm_projector(config)
@@ -566,7 +552,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
         logit_scale = getattr(config, "logit_scale", 1.0)
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 self.vocab_size, logit_scale)
-        self.sampler = Sampler()
+        self.sampler = get_sampler()
 
     def _validate_image_sizes(
             self, images: List[torch.Tensor]) -> List[torch.Tensor]:
@@ -588,7 +574,12 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
 
         pixel_values = self._validate_image_sizes(pixel_values)
         pixel_values = flatten_bn(pixel_values, concat=True)
+
         if pixel_mask is not None:
+            if not isinstance(pixel_mask, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of pixel mask. "
+                                 f"Got type: {type(pixel_mask)}")
+
             pixel_mask = flatten_bn(pixel_mask, concat=True)
 
         return AriaImagePixelInputs(
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py
index 42a239cadac46..987dfaf44f228 100644
--- a/vllm/model_executor/models/blip.py
+++ b/vllm/model_executor/models/blip.py
@@ -4,22 +4,16 @@ from typing import Iterable, Optional, Set, Tuple, Union
 
 import torch
 import torch.nn as nn
-from PIL import Image
 from transformers import Blip2VisionConfig, BlipVisionConfig
 
 from vllm.attention.layer import MultiHeadAttention
-from vllm.config import ModelConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
-from vllm.inputs import DecoderOnlyInputs, token_inputs
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.multimodal.utils import (cached_get_tokenizer,
-                                   repeat_and_pad_placeholder_tokens)
-from vllm.sequence import SequenceData
 
 
 def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
@@ -33,92 +27,6 @@ def get_blip_num_patches(*, image_size: int, patch_size: int) -> int:
     return grid_length * grid_length
 
 
-def get_blip_image_feature_size(
-        hf_config: Union[BlipVisionConfig, Blip2VisionConfig]) -> int:
-    return get_blip_num_patches(image_size=hf_config.image_size,
-                                patch_size=hf_config.patch_size)
-
-
-def get_max_blip_image_tokens(
-        hf_config: Union[BlipVisionConfig, Blip2VisionConfig]) -> int:
-    return get_blip_image_feature_size(hf_config)
-
-
-def dummy_seq_data_for_blip(
-    hf_config: Union[BlipVisionConfig, Blip2VisionConfig],
-    seq_len: int,
-    num_images: int,
-    *,
-    image_token_id: int,
-    image_feature_size_override: Optional[int] = None,
-):
-    if image_feature_size_override is None:
-        image_feature_size = get_blip_image_feature_size(hf_config)
-    else:
-        image_feature_size = image_feature_size_override
-
-    return SequenceData.from_prompt_token_counts(
-        (image_token_id, image_feature_size * num_images),
-        (0, seq_len - image_feature_size * num_images),
-    )
-
-
-def dummy_image_for_blip(
-    hf_config: Union[BlipVisionConfig, Blip2VisionConfig],
-    num_images: int,
-    *,
-    image_width_override: Optional[int] = None,
-    image_height_override: Optional[int] = None,
-):
-    width = height = hf_config.image_size
-    if image_width_override is not None:
-        width = image_width_override
-    if image_height_override is not None:
-        height = image_height_override
-
-    image = Image.new("RGB", (width, height), color=0)
-    return {"image": image if num_images == 1 else [image] * num_images}
-
-
-def input_processor_for_blip(
-    model_config: ModelConfig,
-    hf_config: Union[BlipVisionConfig, Blip2VisionConfig],
-    inputs: DecoderOnlyInputs,
-    *,
-    image_token_id: int,
-    image_feature_size_override: Optional[int] = None,
-):
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return inputs
-
-    if "multi_modal_placeholders" in inputs and "image" in inputs[
-            "multi_modal_placeholders"]:
-        # The inputs already have placeholders.
-        return inputs
-
-    tokenizer = cached_get_tokenizer(model_config.tokenizer)
-
-    if image_feature_size_override is None:
-        image_feature_size = get_blip_image_feature_size(hf_config)
-    else:
-        image_feature_size = image_feature_size_override
-
-    new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
-        tokenizer,
-        inputs.get("prompt"),
-        inputs["prompt_token_ids"],
-        placeholder_token_id=image_token_id,
-        repeat_count=image_feature_size,
-    )
-
-    # NOTE: Create a defensive copy of the original inputs
-    return token_inputs(prompt_token_ids=new_token_ids,
-                        prompt=new_prompt,
-                        multi_modal_data=multi_modal_data,
-                        multi_modal_placeholders={"image": ranges})
-
-
 # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa
 class BlipVisionEmbeddings(nn.Module):
 
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 76b8505ee1c2a..bf70f5d904f5b 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -4,32 +4,33 @@ from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
 
 import torch
 import torch.nn as nn
-from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig,
-                          apply_chunking_to_forward)
+from transformers import (BatchFeature, Blip2Config, Blip2Processor,
+                          Blip2QFormerConfig, apply_chunking_to_forward)
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VllmConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext, token_inputs)
+from vllm.inputs import InputContext
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import NestedTensors
-from vllm.multimodal.utils import consecutive_placeholder_ranges
-from vllm.sequence import IntermediateTensors, SequenceData
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalInputsV2, MultiModalKwargs,
+                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
+from vllm.sequence import IntermediateTensors
 
-from .blip import (BlipVisionModel, dummy_image_for_blip,
-                   get_max_blip_image_tokens)
+from .blip import BlipVisionModel
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
 
 # We use this internally as placeholders since there is no image token
 # defined on the HuggingFace repo
-BLIP2_IMAGE_TOKEN = "<image>"
-BLIP2_IMAGE_TOKEN_ID = 50265
+_IMAGE_TOKEN_ID = 50265
 
 
 class Blip2ImagePixelInputs(TypedDict):
@@ -396,92 +397,87 @@ class Blip2QFormerModel(nn.Module):
         return sequence_output
 
 
-def get_blip2_image_feature_size(hf_config: Blip2Config) -> int:
+def get_max_blip2_image_tokens(ctx: InputContext):
+    hf_config = ctx.get_hf_config(Blip2Config)
     return hf_config.num_query_tokens
 
 
-def get_max_blip2_image_tokens(ctx: InputContext):
-    hf_config = ctx.get_hf_config(Blip2Config)
-    vision_config = hf_config.vision_config
+class Blip2MultiModalProcessor(BaseMultiModalProcessor):
 
-    if isinstance(vision_config, Blip2VisionConfig):
-        return get_max_blip_image_tokens(vision_config)
+    def _get_hf_processor(self) -> Blip2Processor:
+        return self.ctx.get_hf_processor(Blip2Processor)
 
-    msg = f"Unsupported vision config: {type(vision_config)}"
-    raise NotImplementedError(msg)
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
+
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        max_image_tokens = get_max_blip2_image_tokens(self.ctx)
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target="</s>",
+                replacement="<image>" * max_image_tokens + "</s>",
+            )
+        ]
+
+    def apply(
+        self,
+        prompt_text: str,
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalInputsV2:
+        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+
+        # Only <image> tokens should be considered as placeholders,
+        # so we ignore the trailing bos_token
+        result["mm_placeholders"] = {
+            modality: [
+                PlaceholderRange(offset=p["offset"], length=p["length"] - 1)
+                for p in ps
+            ]
+            for modality, ps in result["mm_placeholders"].items()
+        }
+
+        return result
+
+    def _get_dummy_mm_inputs(
+        self,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        hf_config = self.ctx.get_hf_config(Blip2Config)
+        vision_config = hf_config.vision_config
+
+        max_image_size = vision_config.image_size
+        num_images = mm_counts.get("image", 0)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=max_image_size,
+                                   height=max_image_size,
+                                   num_images=num_images)
+        }
+
+        return ProcessorInputs(
+            prompt_text="",
+            mm_data=mm_data,
+        )
 
 
-def dummy_seq_data_for_blip2(
-    hf_config: Blip2Config,
-    seq_len: int,
-    num_images: int,
-    *,
-    image_token_id: int,
-    image_feature_size_override: Optional[int] = None,
-):
-    if image_feature_size_override is None:
-        image_feature_size = get_blip2_image_feature_size(hf_config)
-    else:
-        image_feature_size = image_feature_size_override
-
-    return SequenceData.from_prompt_token_counts(
-        (image_token_id, image_feature_size * num_images),
-        (0, seq_len - image_feature_size * num_images),
-    ), {
-        "image":
-        consecutive_placeholder_ranges(num_items=num_images,
-                                       item_size=image_feature_size)
-    }
-
-
-def dummy_data_for_blip2(ctx: InputContext, seq_len: int,
-                         mm_counts: Mapping[str, int]):
-    hf_config = ctx.get_hf_config(Blip2Config)
-    vision_config = hf_config.vision_config
-    num_images = mm_counts["image"]
-
-    seq_data, ranges = dummy_seq_data_for_blip2(
-        hf_config,
-        seq_len,
-        num_images,
-        image_token_id=BLIP2_IMAGE_TOKEN_ID,
-    )
-
-    if isinstance(vision_config, Blip2VisionConfig):
-        mm_data = dummy_image_for_blip(vision_config, num_images)
-
-        return DummyData(seq_data, mm_data, ranges)
-
-    msg = f"Unsupported vision config: {type(vision_config)}"
-    raise NotImplementedError(msg)
-
-
-def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs):
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return inputs
-
-    hf_config = ctx.get_hf_config(Blip2Config)
-    image_feature_size = get_blip2_image_feature_size(hf_config)
-
-    # The original model places image tokens at the front
-    # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1514
-    new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size
-    new_token_ids += inputs["prompt_token_ids"]
-
-    new_prompt = inputs.get("prompt")
-    if new_prompt is not None:
-        new_prompt = BLIP2_IMAGE_TOKEN * image_feature_size + new_prompt
-
-    return token_inputs(prompt_token_ids=new_token_ids,
-                        prompt=new_prompt,
-                        multi_modal_data=multi_modal_data)
-
-
-@MULTIMODAL_REGISTRY.register_image_input_mapper()
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_blip2_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_blip2)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_blip2)
+@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor)
 class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -627,7 +623,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
         if multimodal_embeddings is not None:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
-                BLIP2_IMAGE_TOKEN_ID)
+                _IMAGE_TOKEN_ID)
         return inputs_embeds
 
     def forward(
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index a40c321ce0a58..85fca23b05746 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -3,16 +3,15 @@ from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set,
                     Tuple, TypedDict, Union)
 
 import torch
+import torch.nn as nn
 import torch.nn.functional as F
-from PIL import Image
-from torch import nn
-from transformers import ChameleonConfig, ChameleonVQVAEConfig
+from transformers import (BatchFeature, ChameleonConfig, ChameleonProcessor,
+                          ChameleonVQVAEConfig)
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext, token_inputs)
+from vllm.inputs import InputContext
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
@@ -29,11 +28,13 @@ from vllm.model_executor.model_loader.weight_utils import (
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import NestedTensors
-from vllm.multimodal.utils import (cached_get_tokenizer,
-                                   consecutive_placeholder_ranges,
-                                   repeat_and_pad_placeholder_tokens)
-from vllm.sequence import IntermediateTensors, SequenceData
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalInputsV2, MultiModalKwargs,
+                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
+from vllm.sequence import IntermediateTensors
 from vllm.utils import print_warning_once
 
 from .interfaces import SupportsMultiModal, SupportsPP
@@ -45,10 +46,6 @@ from .utils import (is_pp_missing_parameter,
 # and processor files, so we hardcode them in the model file for now.
 CHAMELEON_CROP_SIZE_HEIGHT = CHAMELEON_CROP_SIZE_WIDTH = 512
 CHAMELEON_IMAGE_SEQ_LENGTH = 1024
-CHAMELEON_IMAGE_TOKEN_ID = 8711
-CHAMELEON_IMAGE_START_TOKEN_ID = 8197
-CHAMELEON_IMAGE_END_TOKEN_ID = 8196
-CHAMELEON_SEP_TOKEN_ID = 8710
 
 
 class ChameleonImagePixelInputs(TypedDict):
@@ -61,99 +58,75 @@ def get_max_chameleon_image_tokens(ctx: InputContext):
     return CHAMELEON_IMAGE_SEQ_LENGTH
 
 
-def dummy_seq_data_for_chameleon(
-    seq_len: int,
-    num_images: int,
-    *,
-    image_token_id: int,
-    image_feature_size_override: Optional[int] = None,
-):
-    if image_feature_size_override is None:
-        image_feature_size = CHAMELEON_IMAGE_SEQ_LENGTH
-    else:
-        image_feature_size = image_feature_size_override
+class ChameleonMultiModalProcessor(BaseMultiModalProcessor):
 
-    return SequenceData.from_prompt_token_counts(
-        (image_token_id, image_feature_size * num_images),
-        (0, seq_len - image_feature_size * num_images),
-    ), {
-        "image":
-        consecutive_placeholder_ranges(num_items=num_images,
-                                       item_size=image_feature_size)
-    }
+    def _get_hf_processor(self) -> ChameleonProcessor:
+        return self.ctx.get_hf_processor(ChameleonProcessor)
 
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(pixel_values=MultiModalFieldConfig.batched("image"))
 
-def dummy_image_for_chameleon(
-    num_images: int,
-    *,
-    image_width_override: Optional[int] = None,
-    image_height_override: Optional[int] = None,
-):
-    width = CHAMELEON_CROP_SIZE_WIDTH
-    height = CHAMELEON_CROP_SIZE_HEIGHT
-    if image_width_override is not None:
-        width = image_width_override
-    if image_height_override is not None:
-        height = image_height_override
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        processor = self._get_hf_processor()
 
-    image = Image.new("RGB", (width, height), color=0)
-    return {"image": image if num_images == 1 else [image] * num_images}
+        return [
+            PromptReplacement(
+                modality="image",
+                target="<image>",
+                replacement="".join([
+                    processor.image_start_token,
+                    processor.image_token * CHAMELEON_IMAGE_SEQ_LENGTH,
+                    processor.image_end_token,
+                ]),
+            )
+        ]
 
+    def _get_dummy_mm_inputs(
+        self,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        num_images = mm_counts.get("image", 0)
 
-def dummy_data_for_chameleon(ctx: InputContext, seq_len: int,
-                             mm_counts: Mapping[str, int]):
-    num_images = mm_counts["image"]
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=CHAMELEON_CROP_SIZE_WIDTH,
+                                   height=CHAMELEON_CROP_SIZE_HEIGHT,
+                                   num_images=num_images)
+        }
 
-    seq_data, ranges = dummy_seq_data_for_chameleon(
-        seq_len,
-        num_images,
-        image_token_id=CHAMELEON_IMAGE_TOKEN_ID,
-    )
+        return ProcessorInputs(
+            prompt_text="<image>" * num_images,
+            mm_data=mm_data,
+        )
 
-    mm_data = dummy_image_for_chameleon(num_images)
-    return DummyData(seq_data, mm_data, ranges)
+    def apply(
+        self,
+        prompt_text: str,
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalInputsV2:
+        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
 
+        # Only <image> tokens should be considered as placeholders,
+        # so we ignore the image_start_token and image_end_token
+        result["mm_placeholders"] = {
+            modality: [
+                PlaceholderRange(offset=p["offset"] + 1,
+                                 length=p["length"] - 2) for p in ps
+            ]
+            for modality, ps in result["mm_placeholders"].items()
+        }
 
-def input_processor_for_chameleon(ctx: InputContext,
-                                  inputs: DecoderOnlyInputs):
-
-    """
-    Processing input prompt to insert required tokens for image placeholder.
-
-    See https://github.com/huggingface/transformers/blob/0fdea8607d7e01eb0e38a1ebeb7feee30a22f0cf/src/transformers/models/chameleon/processing_chameleon.py#L58
-    """ # noqa
-
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return inputs
-
-    if "multi_modal_placeholders" in inputs and "image" in inputs[
-            "multi_modal_placeholders"]:
-        # The inputs already have placeholders.
-        return inputs
-
-    model_config = ctx.model_config
-    tokenizer = cached_get_tokenizer(model_config.tokenizer)
-    new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
-        tokenizer,
-        inputs.get("prompt"),
-        inputs["prompt_token_ids"],
-        placeholder_token_id=CHAMELEON_IMAGE_TOKEN_ID,
-        repeat_count=CHAMELEON_IMAGE_SEQ_LENGTH,
-        pad_token_left=CHAMELEON_IMAGE_START_TOKEN_ID,
-        pad_token_right=CHAMELEON_IMAGE_END_TOKEN_ID,
-    )
-
-    # Appending sep token for chat mode to follow default processor
-    # behavior
-    if new_prompt is not None:
-        new_prompt += tokenizer.sep_token
-    new_token_ids += [CHAMELEON_SEP_TOKEN_ID]
-
-    # NOTE: Create a defensive copy of the original inputs
-    return token_inputs(prompt_token_ids=new_token_ids,
-                        prompt=new_prompt,
-                        multi_modal_data=multi_modal_data)
+        return result
 
 
 class ChameleonLayerNorm(nn.LayerNorm):
@@ -736,7 +709,7 @@ class ChameleonVQVAEEncoder(nn.Module):
         for i_level in range(self.num_resolutions):
             for i_block in range(self.num_res_blocks):
                 hidden_state = self.down[i_level].block[i_block](
-                    hidden_states[-1], )
+                    hidden_states[-1])
                 if len(self.down[i_level].attn) > 0:
                     hidden_state = self.down[i_level].attn[i_block](
                         hidden_state)
@@ -925,10 +898,8 @@ class ChameleonModel(nn.Module):
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper()
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_chameleon_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_chameleon)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_chameleon)
+@MULTIMODAL_REGISTRY.register_processor(ChameleonMultiModalProcessor)
 class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
                                         SupportsPP):
 
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 6e86900326c4b..8c14866f20b92 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -15,32 +15,30 @@
 # limitations under the License.
 """ PyTorch Fuyu model."""
 import math
-from array import array
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
                     TypedDict)
 
 import torch
 import torch.nn as nn
-import torch.utils.checkpoint
-from PIL import Image
-from transformers import FuyuImageProcessor
+from transformers import (BatchFeature, FuyuConfig, FuyuImageProcessor,
+                          FuyuProcessor)
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext, token_inputs)
+from vllm.inputs import InputContext
 from vllm.model_executor.layers.linear import ColumnParallelLinear
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.models.persimmon import PersimmonForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
-from vllm.multimodal.image import cached_get_image_processor
-from vllm.multimodal.inputs import NestedTensors
-from vllm.multimodal.utils import (cached_get_tokenizer,
-                                   consecutive_placeholder_ranges)
-from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
-                           SequenceData)
-from vllm.utils import is_list_of
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalInputsV2, MultiModalKwargs,
+                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.parse import ImageProcessorItems
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
+from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix,
@@ -54,178 +52,193 @@ MAX_IMAGE_FEATURE_SIZE_HEIGHT = 1080
 MAX_IMAGE_FEATURE_SIZE_WIDTH = 1920
 
 
-class FuyuImagePixelInputs(TypedDict):
-    type: Literal["pixel_values"]
+class FuyuImagePatchInputs(TypedDict):
+    type: Literal["image_patches"]
     data: torch.Tensor
     """
     Shape: 
-    (batch_size, num_patches, patch_size_x * patch_size_y * num_channels)
+    `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)`
+    """
+
+    patches_per_image: List[int]
+    """
+    List of number of total patches for each image in the batch.
+    This is used to restore the first two dimensions of `data`.
     """
 
 
-def _calculate_num_image_tokens(
-    height: int,
-    width: int,
+def _get_fuyu_num_image_tokens(
+    image_height: int,
+    image_width: int,
 ) -> Tuple[int, int]:
     """
-    calculate number of image tokens needed for a given image size
-    The expected Fuyu image prompts is in format:
+    Calculate the number of image tokens needed for a given image size.
+
+    The expected Fuyu image prompts can be expressed as:
+
+    .. code-block::
         (image_token * ncols + newline_token) * nrows
-    args:
-        image_size: Tuple[int, int] - (width, height) of the image
-    returns:
-        ncols: int - number of image tokens in x direction
-        nrows: int - number of image tokens in y direction
+
+    Args:
+        image_size: Tuple[int, int] - `(width, height)` of the image
+
+    Returns:
+        ncols: int - number of image tokens in `x` direction
+        nrows: int - number of image tokens in `y` direction
     """
-    ncol = math.ceil(width / 30)
-    nrow = math.ceil(height / 30)
-    return ncol, nrow
-
-
-def get_max_fuyu_image_feature_size():
-
-    return _calculate_num_image_tokens(
-        height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
-        width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
-    )
+    ncols = math.ceil(image_width / 30)
+    nrows = math.ceil(image_height / 30)
+    return ncols, nrows
 
 
 def get_max_fuyu_image_tokens(ctx: InputContext):
-    ncol, nrow = get_max_fuyu_image_feature_size()
-    return (ncol + 1) * nrow
-
-
-def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int, num_images: int):
-    ncol, nrow = get_max_fuyu_image_feature_size()
-    image_feature_size = get_max_fuyu_image_tokens(ctx)
-
-    image_token_ids = (
-        array(VLLM_TOKEN_ID_ARRAY_TYPE, [_IMAGE_TOKEN_ID]) * ncol +
-        array(VLLM_TOKEN_ID_ARRAY_TYPE, [_NEWLINE_TOKEN_ID])) * nrow
-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, image_token_ids) * num_images
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size * num_images)
-    return SequenceData(token_ids), {
-        "image":
-        consecutive_placeholder_ranges(num_items=num_images,
-                                       item_size=image_feature_size)
-    }
-
-
-def dummy_image_for_fuyu(
-    num_images: int,
-    *,
-    image_width: int,
-    image_height: int,
-):
-    image = Image.new("RGB", (image_width, image_height), color=0)
-    return {"image": image if num_images == 1 else [image] * num_images}
-
-
-def dummy_data_for_fuyu(ctx: InputContext, seq_len: int,
-                        mm_counts: Mapping[str, int]):
-    num_images = mm_counts["image"]
-    seq_data, ranges = dummy_seq_data_for_fuyu(ctx, seq_len, num_images)
-    mm_data = dummy_image_for_fuyu(num_images,
-                                   image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
-                                   image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT)
-    return DummyData(seq_data, mm_data, ranges)
-
-
-def _fuyu_image_preprocess(image_processor: FuyuImageProcessor,
-                           data: List[Image.Image]):
-    image_encoding = image_processor.preprocess(data, return_tensors="pt")
-    batch_images = torch.stack([img[0] for img in image_encoding["images"]
-                                ]).unsqueeze(1)
-    image_unpadded_heights = torch.tensor(
-        image_encoding["image_unpadded_heights"])
-    image_unpadded_widths = torch.tensor(
-        image_encoding["image_unpadded_widths"])
-
-    batch_size = len(image_encoding["images"])
-    image_present = torch.ones(batch_size, 1, 1)
-    model_image_input = image_processor.preprocess_with_tokenizer_info(
-        image_input=batch_images,
-        image_present=image_present,
-        image_unpadded_h=image_unpadded_heights,
-        image_unpadded_w=image_unpadded_widths,
-        image_placeholder_id=_IMAGE_TOKEN_ID,
-        image_newline_id=_NEWLINE_TOKEN_ID,
-        variable_sized=True,
+    ncols, nrows = _get_fuyu_num_image_tokens(
+        image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
+        image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
     )
-    return model_image_input
+
+    return (ncols + 1) * nrows
 
 
-def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs):
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return inputs
+class FuyuMultiModalProcessor(BaseMultiModalProcessor):
 
-    model_config = ctx.model_config
-    image_data = multi_modal_data["image"]
-    new_multi_modal_data = {}
-    image_list = image_data if isinstance(image_data, list) else [image_data]
+    def _get_hf_processor(self) -> FuyuProcessor:
+        return self.ctx.get_hf_processor(FuyuProcessor)
 
-    # process image data
-    if is_list_of(image_list, Image.Image):
-        # Fuyu's image_processor can also finish token padding
-        image_processor: FuyuImageProcessor = cached_get_image_processor(
-            model_config.model)
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
 
-        model_image_input = _fuyu_image_preprocess(image_processor, image_data)
-        image_patches = torch.cat([
-            image_patch[0]
-            for image_patch in model_image_input["image_patches"]
-        ])
-        new_multi_modal_data["image"] = image_patches
+        if not mm_data:
+            # Avoid warning from HF logger for text-only input
+            # Input_ids format: bos_token_id + prompt_token_ids + boa_token_id
+            # Tokenizer won't add boa_token_id by default, we add it manually.
+            tokenizer = self._get_tokenizer()
+            boa_token_id: int = tokenizer.vocab["<0x04>"]  # type: ignore
+            prompt_ids = tokenizer.encode(prompt) + [boa_token_id]
+            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
 
-    elif is_list_of(image_list, torch.Tensor):
-        raise NotImplementedError("Embeddings input is not supported yet")
-    else:
-        raise TypeError(f"Invalid image type: {type(image_data)}")
+        processed_outputs = super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+        )
 
-    # process prompts
-    prompt = inputs.get("prompt")
-    prompt_token_ids = inputs["prompt_token_ids"]
-    tokenizer = cached_get_tokenizer(model_config.model)
-    # dim0 is batch_size, dim1 is subseq_size which will always be 1
-    image_input_ids: List[List[
-        torch.Tensor]] = model_image_input["image_input_ids"]
-    image_input_ids = image_input_ids[0][0].tolist()
-    bos_token = tokenizer.encode("<s>", add_special_tokens=False)[1:]
-    boa_token = tokenizer.encode("\x04", add_special_tokens=False)[1:]
+        image_patches = processed_outputs.get("image_patches")
+        if image_patches is not None:
+            images = mm_data["images"]
+            assert isinstance(images, list)
 
-    new_prompt = prompt + "\x04"
-    new_prompt_token_ids = image_input_ids + bos_token + prompt_token_ids[
-        1:] + boa_token
+            # Original output: (1, num_images, Pn, Px * Py * C)
+            # New output: (num_images, Pn, Px * Py * C)
+            assert (isinstance(image_patches, list)
+                    and len(image_patches) == 1)
+            assert (isinstance(image_patches[0], torch.Tensor)
+                    and len(image_patches[0]) == len(images))
 
-    return token_inputs(prompt=new_prompt,
-                        prompt_token_ids=new_prompt_token_ids,
-                        multi_modal_data=new_multi_modal_data)
+            processed_outputs["image_patches"] = image_patches[0]
+
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(image_patches=MultiModalFieldConfig.batched("image"))
+
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        hf_config = self.ctx.get_hf_config(FuyuConfig)
+        bos_token_id = hf_config.bos_token_id
+
+        tokenizer = self._get_tokenizer()
+        eot_token_id = tokenizer.bos_token_id
+        assert isinstance(eot_token_id, int)
+
+        hf_processor = self._get_hf_processor()
+        image_processor: FuyuImageProcessor = hf_processor.image_processor
+        target_size = image_processor.size
+        target_height, target_width = (target_size["height"],
+                                       target_size["width"])
+
+        def get_replacement_fuyu(item_idx: int):
+            images = mm_items.get_items("image", ImageProcessorItems)
+            image_size = images.get_image_size(item_idx)
+            width, height = image_size.width, image_size.height
+            if not (width <= target_width and height <= target_height):
+                height_scale_factor = target_height / height
+                width_scale_factor = target_width / width
+                optimal_scale_factor = min(height_scale_factor,
+                                           width_scale_factor)
+
+                height = int(height * optimal_scale_factor)
+                width = int(width * optimal_scale_factor)
+
+            ncols, nrows = _get_fuyu_num_image_tokens(
+                image_width=width,
+                image_height=height,
+            )
+
+            return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows +
+                    [bos_token_id])
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[eot_token_id],
+                replacement=get_replacement_fuyu,
+            )
+        ]
+
+    def apply(
+        self,
+        prompt_text: str,
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalInputsV2:
+        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+
+        # Only |SPEAKER| (image) tokens should be considered as placeholders,
+        # so we ignore the trailing bos_token_id
+        result["mm_placeholders"] = {
+            modality: [
+                PlaceholderRange(offset=p["offset"], length=p["length"] - 1)
+                for p in ps
+            ]
+            for modality, ps in result["mm_placeholders"].items()
+        }
+
+        return result
+
+    def _get_dummy_mm_inputs(
+        self,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        num_images = mm_counts.get("image", 0)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
+                                   height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
+                                   num_images=num_images)
+        }
+
+        return ProcessorInputs(
+            prompt_text="",
+            mm_data=mm_data,
+        )
 
 
-def input_mapper_for_fuyu(ctx: InputContext, data: object):
-    model_config = ctx.model_config
-    data_list = data if isinstance(data, list) else [data]
-    if is_list_of(data_list, Image.Image):
-        # Fuyu's image_processor can also finish token padding
-        image_processor: FuyuImageProcessor = cached_get_image_processor(
-            model_config.model)
-
-        model_image_input = _fuyu_image_preprocess(image_processor, data_list)
-        data = torch.stack([
-            image_patch[0]
-            for image_patch in model_image_input["image_patches"]
-        ])
-
-    # image has been processed with prompt in input processor
-    return MultiModalKwargs({"pixel_values": data})
-
-
-@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu)
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_fuyu_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_fuyu)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_fuyu)
+@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor)
 class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -280,28 +293,32 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         return data.to(self.vision_embed_tokens.weight.dtype)
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[FuyuImagePixelInputs]:
-        pixel_values = kwargs.pop("pixel_values", None)
-
-        if pixel_values is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
+            self, **kwargs: object) -> Optional[FuyuImagePatchInputs]:
+        image_patches = kwargs.pop("image_patches", None)
+        if image_patches is not None:
+            if not isinstance(image_patches, (torch.Tensor, list)):
                 raise ValueError("Incorrect type of image patches. "
-                                 f"Got type: {type(pixel_values)}")
+                                 f"Got type: {type(image_patches)}")
 
-            return FuyuImagePixelInputs(
-                type="pixel_values",
+            image_patches_flat = flatten_bn(image_patches)
+
+            return FuyuImagePatchInputs(
+                type="image_patches",
                 data=self._validate_pixel_values(
-                    flatten_bn(pixel_values, concat=True)),
+                    flatten_bn(image_patches_flat, concat=True)),
+                patches_per_image=[x.size(0) for x in image_patches_flat],
             )
 
         return None
 
     def _process_image_input(
-            self, image_input: FuyuImagePixelInputs) -> torch.Tensor:
+            self, image_input: FuyuImagePatchInputs) -> NestedTensors:
+        image_patches = image_input["data"]
+        patches_per_image = image_input["patches_per_image"]
 
         assert self.vision_embed_tokens is not None
-        vision_embeddings, _ = self.vision_embed_tokens(image_input["data"])
-        return vision_embeddings
+        vision_embeddings, _ = self.vision_embed_tokens(image_patches)
+        return vision_embeddings.split(patches_per_image, dim=0)
 
     def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self._parse_and_validate_image_input(**kwargs)
diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index e430a158d869a..4e42a4b6f9e64 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -69,7 +69,8 @@ class Idefics2VisionEmbeddings(nn.Module):
                 patch_attention_mask: torch.BoolTensor,
                 tgt_sizes: Optional[torch.IntTensor] = None) -> torch.Tensor:
         batch_size, _, max_im_h, max_im_w = pixel_values.shape
-        patch_embeds = self.patch_embedding(pixel_values)
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(pixel_values.to(target_dtype))
         embeddings = patch_embeds.flatten(2).transpose(1, 2)
         max_nb_patches_h, max_nb_patches_w = (
             max_im_h // self.patch_size,
@@ -309,7 +310,8 @@ class Idefics2VisionTransformer(nn.Module):
         hidden_states = self.embeddings(
             pixel_values=pixel_values,
             patch_attention_mask=patch_attention_mask,
-            tgt_sizes=tgt_sizes)
+            tgt_sizes=tgt_sizes,
+        )
         encoder_outputs = self.encoder(hidden_states)
         last_hidden_state = self.post_layernorm(encoder_outputs)
         return last_hidden_state
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 1d6ee2a0be72e..34dc7fa31ce6f 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -144,8 +144,8 @@ class LlavaMultiModalProcessor(BaseMultiModalProcessor):
                 # Original output: (1, num_images, C, H, W)
                 # New output: (num_images, C, H, W)
                 assert (isinstance(pixel_values, list)
-                        and len(pixel_values) == 1
-                        and isinstance(pixel_values[0], list)
+                        and len(pixel_values) == 1)
+                assert (isinstance(pixel_values[0], list)
                         and len(pixel_values[0]) == len(images))
 
                 processed_outputs["pixel_values"] = pixel_values[0]
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index a39f2f4124d05..5e70c11363c83 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -528,10 +528,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
         stacked_image_features = self._image_pixels_to_features(
             self.vision_tower, stacked_pixel_values)
 
-        return [
-            self.multi_modal_projector(image_features) for image_features in
-            torch.split(stacked_image_features, num_patches_per_batch)
-        ]
+        return torch.split(self.multi_modal_projector(stacked_image_features),
+                           num_patches_per_batch)
 
     def _process_image_input(
         self,
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 22d29f5bbc50c..2bce13792a88d 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -1,8 +1,8 @@
+import math
 from dataclasses import dataclass, fields
 from functools import cached_property
 from typing import Iterable, List, Mapping, Optional, Set, Tuple, Union
 
-import numpy
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -306,7 +306,7 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
         images: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor],
                                torch.Tensor]] = None,
         image_tokens: Optional[torch.Tensor] = None,
-    ) -> Optional[List[torch.Tensor]]:
+    ) -> Tuple[Optional[List[torch.Tensor]], Optional[torch.Tensor]]:
         if images is None:
             return None, None
 
@@ -604,11 +604,11 @@ class VisionTransformer(nn.Module):
         return self.args.image_size // self.args.patch_size
 
     @property
-    def device(self) -> torch.device:
+    def device(self) -> torch.types.Device:
         return next(self.parameters()).device
 
     @property
-    def dtype(self) -> torch.device:
+    def dtype(self) -> torch.dtype:
         return next(self.parameters()).dtype
 
     @property
@@ -741,8 +741,8 @@ def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig,
     ratio = max(image_width / max_width, image_height / max_height)
 
     if ratio > 1:
-        image_width = int(numpy.ceil(image_width / ratio))
-        image_height = int(numpy.ceil(image_height / ratio))
+        image_width = int(math.ceil(image_width / ratio))
+        image_height = int(math.ceil(image_height / ratio))
 
     num_height_tokens, num_width_tokens = _get_pixtral_hf_num_image_tokens(
         (image_height, image_width),
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index e3d43b017f894..de55bc6bcc123 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -23,7 +23,6 @@ from functools import cached_property
 from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
                     Union)
 
-import numpy as np
 import torch
 import torch.nn as nn
 from transformers import BatchFeature
@@ -177,16 +176,19 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor):
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
         feature_extractor = self._get_feature_extractor()
+
         sampling_rate = feature_extractor.sampling_rate
         audio_len = feature_extractor.chunk_length * sampling_rate
+        num_audios = mm_counts.get("audio", 0)
 
-        audio_count = mm_counts.get("audio", 0)
-        audio = np.zeros(audio_len)
-        data = {"audio": [audio] * audio_count}
+        mm_data = {
+            "audio":
+            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
+        }
 
         return ProcessorInputs(
-            prompt_text="<|AUDIO|>" * audio_count,
-            mm_data=data,
+            prompt_text="<|AUDIO|>" * num_audios,
+            mm_data=mm_data,
         )
 
 
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 6181fe3dd13d8..1e485f87bb7a4 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -29,7 +29,6 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
-from PIL import Image
 from transformers import BatchFeature
 from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
                                           Qwen2VLProcessor)
@@ -882,12 +881,10 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
         self,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        num_images = mm_counts.get("image", 0)
         hf_processor = self._get_hf_processor()
-        image_token: str = hf_processor.image_token
         image_processor = _get_image_processor(hf_processor)
 
-        data = {}
+        image_token: str = hf_processor.image_token
         resized_height, resized_width = smart_resize(
             height=9999999,
             width=9999999,
@@ -895,14 +892,18 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
             min_pixels=image_processor.min_pixels,
             max_pixels=image_processor.max_pixels,
         )
+        num_images = mm_counts.get("image", 0)
 
-        dummy_image = Image.new("RGB", (resized_width, resized_height),
-                                color=0)
-        data["image"] = [dummy_image] * num_images
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=resized_width,
+                                   height=resized_height,
+                                   num_images=num_images)
+        }
 
         return ProcessorInputs(
             prompt_text=image_token * num_images,
-            mm_data=data,
+            mm_data=mm_data,
         )
 
 
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 7e853e5b90096..54be7fed3f2be 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -188,16 +188,19 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
         feature_extractor = self._get_feature_extractor()
+
         sampling_rate = feature_extractor.sampling_rate
         audio_len = feature_extractor.chunk_length * sampling_rate
+        num_audios = mm_counts.get("audio", 0)
 
-        audio_count = mm_counts.get("audio", 0)
-        audio = np.zeros(audio_len)
-        data = {"audio": [audio] * audio_count}
+        mm_data = {
+            "audio":
+            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
+        }
 
         return ProcessorInputs(
-            prompt_text="<|audio|>" * audio_count,
-            mm_data=data,
+            prompt_text="<|audio|>" * num_audios,
+            mm_data=mm_data,
         )
 
 
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 180489166b407..7712c3bcebe20 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1,15 +1,17 @@
 import pickle
 import re
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence
 from dataclasses import dataclass, field
 from functools import lru_cache
 from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union
 
 import numpy as np
+import numpy.typing as npt
 import torch
 from blake3 import blake3
-from PIL.Image import Image
+from PIL import Image
 from transformers import BatchFeature, ProcessorMixin
 
 from vllm.inputs import DummyData, InputProcessingContext
@@ -353,13 +355,13 @@ def _replace_matches(
 ) -> list[_S]:
     out_seqs = list[_S]()
     prev_end_idx = 0
-    next_idx_by_modality = {modality: 0 for modality in mm_item_counts}
+    next_idx_by_modality = defaultdict[str, int](lambda: 0)
 
     for match in _resolve_matches(prompt, matches):
         modality = match.modality
 
         item_idx = next_idx_by_modality[modality]
-        if item_idx >= mm_item_counts[modality]:
+        if item_idx >= mm_item_counts.get(modality, 0):
             continue
 
         start_idx = match.start_idx
@@ -513,7 +515,7 @@ class ProcessingCache:
             return obj.encode("utf-8")
         if isinstance(obj, bytes):
             return obj
-        if isinstance(obj, Image):
+        if isinstance(obj, Image.Image):
             return obj.tobytes()
 
         # Convertible to NumPy arrays
@@ -673,10 +675,14 @@ class BaseMultiModalProcessor(ABC):
         Given the original multi-modal items for this modality
         and HF-processed data, output the replacements to perform.
 
-        Note:
-            Even when the HF processor already performs replacement for us,
-            we still use this replacement information to determine
-            the placeholder token positions for each multi-modal item.
+        Notes:
+            - You should not assume that HF processor always performs prompt
+              replacement: in :meth:`_apply_hf_processor_missing`, this method
+              is called on text-only and multimodal-only inputs separately,
+              instead of passing them in the same call.
+            - The replacement information returned by this method is also used
+              to determine the placeholder token positions for each multi-modal
+              item.
         """
         raise NotImplementedError
 
@@ -710,6 +716,10 @@ class BaseMultiModalProcessor(ABC):
         mm_data: Mapping[str, object],
         mm_kwargs: Mapping[str, object],
     ) -> BatchFeature:
+        """
+        Call the HF processor on the prompt text and
+        associated multi-modal data.
+        """
         return self.ctx.call_hf_processor(
             self._get_hf_processor(**mm_kwargs),
             dict(text=prompt, **mm_data),
@@ -723,7 +733,8 @@ class BaseMultiModalProcessor(ABC):
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> tuple[list[int], MultiModalKwargs]:
         """
-        Apply the HF processor on the full prompt text and multi-modal data.
+        Wrapper of :meth:`_call_hf_processor` that applies
+        additional pre-processing and post-processing.
         """
         processor_data, passthrough_data = self._get_hf_mm_data(mm_items)
 
@@ -754,10 +765,11 @@ class BaseMultiModalProcessor(ABC):
         Apply the HF processor on the full prompt text, but only on the
         multi-modal data that are missing from the cache.
 
-        Note: We pass prompt text and multi-modal data into the HF processor
-        in separate calls to avoid HF prompt replacement being done for
-        cached items; instead, we rely on our own prompt replacement logic
-        for the full text.
+        Note:
+            We pass prompt text and multi-modal data into the HF processor
+            in separate calls to avoid HF prompt replacement being done for
+            cached items; instead, we rely on our own prompt replacement logic
+            (:meth:`_get_prompt_replacements`) for the full text.
         """
         mm_missing_counts = mm_missing_data_items.get_all_counts()
 
@@ -1010,6 +1022,36 @@ class BaseMultiModalProcessor(ABC):
             mm_placeholders=mm_placeholders,
         )
 
+    def _get_dummy_audios(
+        self,
+        *,
+        length: int,
+        num_audios: int,
+    ) -> list[npt.NDArray]:
+        audio = np.zeros((length, ))
+        return [audio] * num_audios
+
+    def _get_dummy_images(
+        self,
+        *,
+        width: int,
+        height: int,
+        num_images: int,
+    ) -> list[Image.Image]:
+        image = Image.new("RGB", (width, height), color=0)
+        return [image] * num_images
+
+    def _get_dummy_videos(
+        self,
+        *,
+        width: int,
+        height: int,
+        num_frames: int,
+        num_videos: int,
+    ) -> list[npt.NDArray]:
+        video = np.zeros((num_frames, width, height, 3))
+        return [video] * num_videos
+
     @abstractmethod
     def _get_dummy_mm_inputs(
         self,
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 87b12a6fb33c1..7b6ded6a27084 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -400,15 +400,19 @@ def repeat_and_pad_placeholder_tokens(
     placeholder_token_idx = 0
     for i, token in enumerate(prompt_token_ids):
         if token == placeholder_token_id:
+            curr_repeat_count = repeat_count[placeholder_token_idx]
             replacement_ids = repeat_and_pad_token(
                 placeholder_token_id,
-                repeat_count=repeat_count[placeholder_token_idx],
+                repeat_count=curr_repeat_count,
                 pad_token_left=pad_token_left,
                 pad_token_right=pad_token_right,
             )
+            offset = len(new_token_ids)
+            if pad_token_left is not None:
+                offset += 1
             placeholder_ranges.append({
-                "offset": len(new_token_ids),
-                "length": len(replacement_ids)
+                "offset": offset,
+                "length": curr_repeat_count,
             })
             new_token_ids.extend(replacement_ids)
             placeholder_token_idx += 1
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 509771b7e2e5a..a08a86d4007dc 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -647,10 +647,23 @@ class GPUModelRunner:
                 self.mm_registry.get_max_tokens_per_item_by_modality(
                     self.model_config).values())
 
-            max_num_mm_items = min(
+            max_num_mm_items_encoder_budget = min(
                 self.max_num_encoder_input_tokens,
                 self.encoder_cache_size) // max_tokens_per_mm_item
 
+            max_mm_items_per_req = max(
+                self.mm_registry.get_mm_limits_per_prompt(
+                    self.model_config).values())
+
+            # NOTE: We do not consider max_num_batched_tokens on purpose
+            # because the multimodal embeddings can be generated in advance
+            # and chunked prefilled.
+            max_num_mm_items_decoder_budget = self.max_num_reqs * \
+                max_mm_items_per_req
+
+            max_num_mm_items = min(max_num_mm_items_encoder_budget,
+                                   max_num_mm_items_decoder_budget)
+
             # Dummy data definition in V0 may contain multiple multimodal items
             # (e.g, multiple images) for a single request, therefore here we
             # always replicate first item by max_num_mm_items times since in V1

From 0c6f9985547d6b510d34c6c873db54abe03eb346 Mon Sep 17 00:00:00 2001
From: Yihua Cheng <yihua98@uchicago.edu>
Date: Tue, 31 Dec 2024 18:10:55 -0600
Subject: [PATCH 43/48] [Benchmark] Add benchmark script for CPU offloading 
 (#11533)

Signed-off-by: ApostaC <yihua98@uchicago.edu>
Co-authored-by: KuntaiDu <kuntai@uchicago.edu>
---
 .../benchmark_long_document_qa_throughput.py  | 184 ++++++++++++++++++
 1 file changed, 184 insertions(+)
 create mode 100644 benchmarks/benchmark_long_document_qa_throughput.py

diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py
new file mode 100644
index 0000000000000..13477ef535e86
--- /dev/null
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@@ -0,0 +1,184 @@
+"""
+Offline benchmark to test the long document QA throughput.
+
+Example usage:
+    # This command run the vllm with 50GB CPU memory for offloading
+    # The workload samples 8 different prompts with a default input
+    # length of 20000 tokens, then replicates each prompt 2 times 
+    # in random order.
+    python benchmark_long_document_qa_throughput.py \
+        --model meta-llama/Llama-2-7b-chat-hf \
+        --enable-prefix-caching \
+        --num-documents 8 \
+        --repeat-count 2 
+
+Commandline arguments:
+    --num-documents: The number of documents to sample prompts from.
+
+    --document-length: The length of each document in tokens. 
+                       (Optional, default: 20000)
+
+    --output-len: The number of tokens to generate for each prompt.
+                  (Optional, default: 10)
+
+    --repeat-count: The number of times to repeat each prompt.
+                    (Optional, default: 2)
+
+    --repeat-mode: The mode to repeat prompts. The supported modes are:
+        - 'random': shuffle the prompts randomly. (Default)
+        - 'tile': the entire prompt list is repeated in sequence. (Potentially
+                  lowest cache hit)
+        - 'interleave': each prompt is repeated consecutively before 
+                        moving to the next element. (Highest cache hit)
+    
+    --shuffle-seed: Random seed when the repeat mode is "random".
+                    (Optional, default: 0)
+
+In the meantime, it also supports all the vLLM engine args to initialize the 
+LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more
+details.
+"""
+
+import dataclasses
+import random
+import time
+
+from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
+    """
+    Test long document QA with the given prompts and sampling parameters.
+    Print the time spent in processing all the prompts.
+
+    Args:
+        llm: The language model used for generating responses.
+        sampling_params: Sampling parameter used to generate the response.
+        prompts: A list of prompt strings to be processed by the LLM.
+    """
+    start_time = time.time()
+    llm.generate(prompts, sampling_params=sampling_params)
+    end_time = time.time()
+    print(f"Time to execute all requests: {end_time - start_time:.4f} secs")
+
+
+def repeat_prompts(prompts, repeat_count, mode: str):
+    """
+    Repeat each prompt in the list for a specified number of times.
+    The order of prompts in the output list depends on the mode.
+
+    Args:
+        prompts: A list of prompts to be repeated.
+        repeat_count: The number of times each prompt is repeated.
+        mode: The mode of repetition. Supported modes are:
+            - 'random': Shuffle the prompts randomly after repetition.
+            - 'tile': Repeat the entire prompt list in sequence.
+              Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
+            - 'interleave': Repeat each prompt consecutively before moving to 
+              the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].
+
+    Returns:
+        A list of repeated prompts in the specified order.
+
+    Raises:
+        ValueError: If an invalid mode is provided.
+    """
+    print("Repeat mode: ", mode)
+    if mode == 'random':
+        repeated_prompts = prompts * repeat_count
+        random.shuffle(repeated_prompts)
+        return repeated_prompts
+    elif mode == 'tile':
+        return prompts * repeat_count
+    elif mode == 'interleave':
+        repeated_prompts = []
+        for prompt in prompts:
+            repeated_prompts.extend([prompt] * repeat_count)
+        return repeated_prompts
+    else:
+        raise ValueError(f"Invalid mode: {mode}, only support "
+                         "'random', 'tile', 'interleave'")
+
+
+def main(args):
+    random.seed(args.shuffle_seed)
+
+    # Prepare the prompts:
+    # we append the document id at the beginning to avoid any of the document
+    # being the prefix of other documents
+    prompts = [
+        str(i) + ' '.join(['hi'] * args.document_length)
+        for i in range(args.num_documents)
+    ]
+
+    prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)
+
+    warmup_prompts = [
+        "This is warm up request " + str(i) + \
+                ' '.join(['hi'] * args.document_length)
+        for i in range(args.num_documents)]
+
+    # Create the LLM engine
+    engine_args = EngineArgs.from_cli_args(args)
+    llm = LLM(**dataclasses.asdict(engine_args))
+    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+
+    print("------warm up------")
+    test_long_document_qa(
+        llm=llm,
+        prompts=warmup_prompts,
+        sampling_params=sampling_params,
+    )
+
+    print("------start generating------")
+    test_long_document_qa(
+        llm=llm,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description=
+        'Benchmark the performance with or without automatic prefix caching.')
+
+    parser.add_argument(
+        '--document-length',
+        type=int,
+        # Roughly the number of tokens for a system paper,
+        # excluding images
+        default=20000,
+        help='Range of input lengths for sampling prompts,'
+        'specified as "min:max" (e.g., "128:256").')
+
+    parser.add_argument('--num-documents',
+                        type=int,
+                        default=8,
+                        help='Range of input lengths for sampling prompts,'
+                        'specified as "min:max" (e.g., "128:256").')
+
+    parser.add_argument('--output-len', type=int, default=10)
+
+    parser.add_argument('--repeat-count',
+                        type=int,
+                        default=2,
+                        help='Number of times to repeat each prompt')
+
+    parser.add_argument("--repeat-mode",
+                        type=str,
+                        default='random',
+                        help='The mode to repeat prompts. The supported '
+                        'modes are "random", "tile", and "interleave". '
+                        'See repeat_prompts() in the source code for details.')
+
+    parser.add_argument("--shuffle-seed",
+                        type=int,
+                        default=0,
+                        help='Random seed when the repeat mode is "random"')
+
+    parser = EngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    main(args)

From 4db72e57f6e8da5e78285e9868e9327167bea973 Mon Sep 17 00:00:00 2001
From: Joe Runde <Joseph.Runde@ibm.com>
Date: Tue, 31 Dec 2024 18:21:51 -0800
Subject: [PATCH 44/48] [Bugfix][Refactor] Unify model management in frontend
 (#11660)

Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
---
 tests/entrypoints/openai/test_cli_args.py     |   2 +-
 tests/entrypoints/openai/test_lora_lineage.py |  32 ++-
 tests/entrypoints/openai/test_serving_chat.py |  20 +-
 ...rving_engine.py => test_serving_models.py} |  66 +++---
 vllm/entrypoints/openai/api_server.py         |  62 +++---
 vllm/entrypoints/openai/cli_args.py           |   2 +-
 vllm/entrypoints/openai/run_batch.py          |  15 +-
 vllm/entrypoints/openai/serving_chat.py       |  16 +-
 vllm/entrypoints/openai/serving_completion.py |  16 +-
 vllm/entrypoints/openai/serving_embedding.py  |   9 +-
 vllm/entrypoints/openai/serving_engine.py     | 192 ++--------------
 vllm/entrypoints/openai/serving_models.py     | 210 ++++++++++++++++++
 vllm/entrypoints/openai/serving_pooling.py    |   9 +-
 vllm/entrypoints/openai/serving_score.py      |   9 +-
 .../openai/serving_tokenization.py            |  12 +-
 15 files changed, 365 insertions(+), 307 deletions(-)
 rename tests/entrypoints/openai/{test_serving_engine.py => test_serving_models.py} (61%)
 create mode 100644 vllm/entrypoints/openai/serving_models.py

diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py
index 45e6980a94630..e49562ad6a21f 100644
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
@@ -4,7 +4,7 @@ import pytest
 
 from vllm.entrypoints.openai.cli_args import (make_arg_parser,
                                               validate_parsed_serve_args)
-from vllm.entrypoints.openai.serving_engine import LoRAModulePath
+from vllm.entrypoints.openai.serving_models import LoRAModulePath
 from vllm.utils import FlexibleArgumentParser
 
 from ...utils import VLLM_PATH
diff --git a/tests/entrypoints/openai/test_lora_lineage.py b/tests/entrypoints/openai/test_lora_lineage.py
index ab39684c2f31a..ce4f85c13fff9 100644
--- a/tests/entrypoints/openai/test_lora_lineage.py
+++ b/tests/entrypoints/openai/test_lora_lineage.py
@@ -55,7 +55,10 @@ def server_with_lora_modules_json(zephyr_lora_files):
         "64",
     ]
 
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    # Enable the /v1/load_lora_adapter endpoint
+    envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
         yield remote_server
 
 
@@ -67,8 +70,8 @@ async def client_for_lora_lineage(server_with_lora_modules_json):
 
 
 @pytest.mark.asyncio
-async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
-                                  zephyr_lora_files):
+async def test_static_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
+                                   zephyr_lora_files):
     models = await client_for_lora_lineage.models.list()
     models = models.data
     served_model = models[0]
@@ -81,3 +84,26 @@ async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
     assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
     assert lora_models[0].id == "zephyr-lora"
     assert lora_models[1].id == "zephyr-lora2"
+
+
+@pytest.mark.asyncio
+async def test_dynamic_lora_lineage(
+        client_for_lora_lineage: openai.AsyncOpenAI, zephyr_lora_files):
+
+    response = await client_for_lora_lineage.post("load_lora_adapter",
+                                                  cast_to=str,
+                                                  body={
+                                                      "lora_name":
+                                                      "zephyr-lora-3",
+                                                      "lora_path":
+                                                      zephyr_lora_files
+                                                  })
+    # Ensure adapter loads before querying /models
+    assert "success" in response
+
+    models = await client_for_lora_lineage.models.list()
+    models = models.data
+    dynamic_lora_model = models[-1]
+    assert dynamic_lora_model.root == zephyr_lora_files
+    assert dynamic_lora_model.parent == MODEL_NAME
+    assert dynamic_lora_model.id == "zephyr-lora-3"
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 61677b65af342..97248f1150979 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -8,7 +8,8 @@ from vllm.config import MultiModalConfig
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
-from vllm.entrypoints.openai.serving_engine import BaseModelPath
+from vllm.entrypoints.openai.serving_models import (BaseModelPath,
+                                                    OpenAIServingModels)
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 MODEL_NAME = "openai-community/gpt2"
@@ -50,14 +51,13 @@ async def _async_serving_chat_init():
     engine = MockEngine()
     model_config = await engine.get_model_config()
 
+    models = OpenAIServingModels(model_config, BASE_MODEL_PATHS)
     serving_completion = OpenAIServingChat(engine,
                                            model_config,
-                                           BASE_MODEL_PATHS,
+                                           models,
                                            response_role="assistant",
                                            chat_template=CHAT_TEMPLATE,
                                            chat_template_content_format="auto",
-                                           lora_modules=None,
-                                           prompt_adapters=None,
                                            request_logger=None)
     return serving_completion
 
@@ -72,14 +72,14 @@ def test_serving_chat_should_set_correct_max_tokens():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
 
+    models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=MockModelConfig())
     serving_chat = OpenAIServingChat(mock_engine,
                                      MockModelConfig(),
-                                     BASE_MODEL_PATHS,
+                                     models,
                                      response_role="assistant",
                                      chat_template=CHAT_TEMPLATE,
                                      chat_template_content_format="auto",
-                                     lora_modules=None,
-                                     prompt_adapters=None,
                                      request_logger=None)
     req = ChatCompletionRequest(
         model=MODEL_NAME,
@@ -115,14 +115,14 @@ def test_serving_chat_could_load_correct_generation_config():
     mock_engine.errored = False
 
     # Initialize the serving chat
+    models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
     serving_chat = OpenAIServingChat(mock_engine,
                                      mock_model_config,
-                                     BASE_MODEL_PATHS,
+                                     models,
                                      response_role="assistant",
                                      chat_template=CHAT_TEMPLATE,
                                      chat_template_content_format="auto",
-                                     lora_modules=None,
-                                     prompt_adapters=None,
                                      request_logger=None)
     req = ChatCompletionRequest(
         model=MODEL_NAME,
diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_models.py
similarity index 61%
rename from tests/entrypoints/openai/test_serving_engine.py
rename to tests/entrypoints/openai/test_serving_models.py
index 096ab6fa0ac09..96897dc730da2 100644
--- a/tests/entrypoints/openai/test_serving_engine.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -4,11 +4,11 @@ from unittest.mock import MagicMock
 import pytest
 
 from vllm.config import ModelConfig
-from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.protocol import (ErrorResponse,
                                               LoadLoraAdapterRequest,
                                               UnloadLoraAdapterRequest)
-from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
+from vllm.entrypoints.openai.serving_models import (BaseModelPath,
+                                                    OpenAIServingModels)
 from vllm.lora.request import LoRARequest
 
 MODEL_NAME = "meta-llama/Llama-2-7b"
@@ -19,47 +19,45 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
     "Success: LoRA adapter '{lora_name}' removed successfully.")
 
 
-async def _async_serving_engine_init():
-    mock_engine_client = MagicMock(spec=EngineClient)
+async def _async_serving_models_init() -> OpenAIServingModels:
     mock_model_config = MagicMock(spec=ModelConfig)
     # Set the max_model_len attribute to avoid missing attribute
     mock_model_config.max_model_len = 2048
 
-    serving_engine = OpenAIServing(mock_engine_client,
-                                   mock_model_config,
-                                   BASE_MODEL_PATHS,
-                                   lora_modules=None,
-                                   prompt_adapters=None,
-                                   request_logger=None)
-    return serving_engine
+    serving_models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
+                                         model_config=mock_model_config,
+                                         lora_modules=None,
+                                         prompt_adapters=None)
+
+    return serving_models
 
 
 @pytest.mark.asyncio
 async def test_serving_model_name():
-    serving_engine = await _async_serving_engine_init()
-    assert serving_engine._get_model_name(None) == MODEL_NAME
+    serving_models = await _async_serving_models_init()
+    assert serving_models.model_name(None) == MODEL_NAME
     request = LoRARequest(lora_name="adapter",
                           lora_path="/path/to/adapter2",
                           lora_int_id=1)
-    assert serving_engine._get_model_name(request) == request.lora_name
+    assert serving_models.model_name(request) == request.lora_name
 
 
 @pytest.mark.asyncio
 async def test_load_lora_adapter_success():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
     request = LoadLoraAdapterRequest(lora_name="adapter",
                                      lora_path="/path/to/adapter2")
-    response = await serving_engine.load_lora_adapter(request)
+    response = await serving_models.load_lora_adapter(request)
     assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
-    assert len(serving_engine.lora_requests) == 1
-    assert serving_engine.lora_requests[0].lora_name == "adapter"
+    assert len(serving_models.lora_requests) == 1
+    assert serving_models.lora_requests[0].lora_name == "adapter"
 
 
 @pytest.mark.asyncio
 async def test_load_lora_adapter_missing_fields():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
     request = LoadLoraAdapterRequest(lora_name="", lora_path="")
-    response = await serving_engine.load_lora_adapter(request)
+    response = await serving_models.load_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
     assert response.type == "InvalidUserInput"
     assert response.code == HTTPStatus.BAD_REQUEST
@@ -67,43 +65,43 @@ async def test_load_lora_adapter_missing_fields():
 
 @pytest.mark.asyncio
 async def test_load_lora_adapter_duplicate():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
     request = LoadLoraAdapterRequest(lora_name="adapter1",
                                      lora_path="/path/to/adapter1")
-    response = await serving_engine.load_lora_adapter(request)
+    response = await serving_models.load_lora_adapter(request)
     assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
         lora_name='adapter1')
-    assert len(serving_engine.lora_requests) == 1
+    assert len(serving_models.lora_requests) == 1
 
     request = LoadLoraAdapterRequest(lora_name="adapter1",
                                      lora_path="/path/to/adapter1")
-    response = await serving_engine.load_lora_adapter(request)
+    response = await serving_models.load_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
     assert response.type == "InvalidUserInput"
     assert response.code == HTTPStatus.BAD_REQUEST
-    assert len(serving_engine.lora_requests) == 1
+    assert len(serving_models.lora_requests) == 1
 
 
 @pytest.mark.asyncio
 async def test_unload_lora_adapter_success():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
     request = LoadLoraAdapterRequest(lora_name="adapter1",
                                      lora_path="/path/to/adapter1")
-    response = await serving_engine.load_lora_adapter(request)
-    assert len(serving_engine.lora_requests) == 1
+    response = await serving_models.load_lora_adapter(request)
+    assert len(serving_models.lora_requests) == 1
 
     request = UnloadLoraAdapterRequest(lora_name="adapter1")
-    response = await serving_engine.unload_lora_adapter(request)
+    response = await serving_models.unload_lora_adapter(request)
     assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
         lora_name='adapter1')
-    assert len(serving_engine.lora_requests) == 0
+    assert len(serving_models.lora_requests) == 0
 
 
 @pytest.mark.asyncio
 async def test_unload_lora_adapter_missing_fields():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
     request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None)
-    response = await serving_engine.unload_lora_adapter(request)
+    response = await serving_models.unload_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
     assert response.type == "InvalidUserInput"
     assert response.code == HTTPStatus.BAD_REQUEST
@@ -111,9 +109,9 @@ async def test_unload_lora_adapter_missing_fields():
 
 @pytest.mark.asyncio
 async def test_unload_lora_adapter_not_found():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
     request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
-    response = await serving_engine.unload_lora_adapter(request)
+    response = await serving_models.unload_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
     assert response.type == "InvalidUserInput"
     assert response.code == HTTPStatus.BAD_REQUEST
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index bac72d87376da..74fe378fdae42 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -58,7 +58,9 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
-from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import (BaseModelPath,
+                                                    OpenAIServingModels)
 from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling
 from vllm.entrypoints.openai.serving_score import OpenAIServingScores
 from vllm.entrypoints.openai.serving_tokenization import (
@@ -269,6 +271,10 @@ def base(request: Request) -> OpenAIServing:
     return tokenization(request)
 
 
+def models(request: Request) -> OpenAIServingModels:
+    return request.app.state.openai_serving_models
+
+
 def chat(request: Request) -> Optional[OpenAIServingChat]:
     return request.app.state.openai_serving_chat
 
@@ -336,10 +342,10 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
 
 @router.get("/v1/models")
 async def show_available_models(raw_request: Request):
-    handler = base(raw_request)
+    handler = models(raw_request)
 
-    models = await handler.show_available_models()
-    return JSONResponse(content=models.model_dump())
+    models_ = await handler.show_available_models()
+    return JSONResponse(content=models_.model_dump())
 
 
 @router.get("/version")
@@ -505,26 +511,22 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
     @router.post("/v1/load_lora_adapter")
     async def load_lora_adapter(request: LoadLoraAdapterRequest,
                                 raw_request: Request):
-        for route in [chat, completion, embedding]:
-            handler = route(raw_request)
-            if handler is not None:
-                response = await handler.load_lora_adapter(request)
-                if isinstance(response, ErrorResponse):
-                    return JSONResponse(content=response.model_dump(),
-                                        status_code=response.code)
+        handler = models(raw_request)
+        response = await handler.load_lora_adapter(request)
+        if isinstance(response, ErrorResponse):
+            return JSONResponse(content=response.model_dump(),
+                                status_code=response.code)
 
         return Response(status_code=200, content=response)
 
     @router.post("/v1/unload_lora_adapter")
     async def unload_lora_adapter(request: UnloadLoraAdapterRequest,
                                   raw_request: Request):
-        for route in [chat, completion, embedding]:
-            handler = route(raw_request)
-            if handler is not None:
-                response = await handler.unload_lora_adapter(request)
-                if isinstance(response, ErrorResponse):
-                    return JSONResponse(content=response.model_dump(),
-                                        status_code=response.code)
+        handler = models(raw_request)
+        response = await handler.unload_lora_adapter(request)
+        if isinstance(response, ErrorResponse):
+            return JSONResponse(content=response.model_dump(),
+                                status_code=response.code)
 
         return Response(status_code=200, content=response)
 
@@ -628,13 +630,18 @@ def init_app_state(
     resolved_chat_template = load_chat_template(args.chat_template)
     logger.info("Using supplied chat template:\n%s", resolved_chat_template)
 
+    state.openai_serving_models = OpenAIServingModels(
+        model_config=model_config,
+        base_model_paths=base_model_paths,
+        lora_modules=args.lora_modules,
+        prompt_adapters=args.prompt_adapters,
+    )
+    # TODO: The chat template is now broken for lora adapters :(
     state.openai_serving_chat = OpenAIServingChat(
         engine_client,
         model_config,
-        base_model_paths,
+        state.openai_serving_models,
         args.response_role,
-        lora_modules=args.lora_modules,
-        prompt_adapters=args.prompt_adapters,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
@@ -646,16 +653,14 @@ def init_app_state(
     state.openai_serving_completion = OpenAIServingCompletion(
         engine_client,
         model_config,
-        base_model_paths,
-        lora_modules=args.lora_modules,
-        prompt_adapters=args.prompt_adapters,
+        state.openai_serving_models,
         request_logger=request_logger,
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
     ) if model_config.runner_type == "generate" else None
     state.openai_serving_pooling = OpenAIServingPooling(
         engine_client,
         model_config,
-        base_model_paths,
+        state.openai_serving_models,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
@@ -663,7 +668,7 @@ def init_app_state(
     state.openai_serving_embedding = OpenAIServingEmbedding(
         engine_client,
         model_config,
-        base_model_paths,
+        state.openai_serving_models,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
@@ -671,14 +676,13 @@ def init_app_state(
     state.openai_serving_scores = OpenAIServingScores(
         engine_client,
         model_config,
-        base_model_paths,
+        state.openai_serving_models,
         request_logger=request_logger
     ) if model_config.task == "score" else None
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
         model_config,
-        base_model_paths,
-        lora_modules=args.lora_modules,
+        state.openai_serving_models,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 908f8c3532c9e..22206ef8dbfe6 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -12,7 +12,7 @@ from typing import List, Optional, Sequence, Union, get_args
 from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
 from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
                                          validate_chat_template)
-from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
+from vllm.entrypoints.openai.serving_models import (LoRAModulePath,
                                                     PromptAdapterPath)
 from vllm.entrypoints.openai.tool_parsers import ToolParserManager
 from vllm.utils import FlexibleArgumentParser
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 572ed27b39083..822c0f5f7c211 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -20,7 +20,8 @@ from vllm.entrypoints.openai.protocol import (BatchRequestInput,
 # yapf: enable
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
-from vllm.entrypoints.openai.serving_engine import BaseModelPath
+from vllm.entrypoints.openai.serving_models import (BaseModelPath,
+                                                    OpenAIServingModels)
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, random_uuid
 from vllm.version import __version__ as VLLM_VERSION
@@ -213,13 +214,17 @@ async def main(args):
         request_logger = RequestLogger(max_log_len=args.max_log_len)
 
     # Create the openai serving objects.
+    openai_serving_models = OpenAIServingModels(
+        model_config=model_config,
+        base_model_paths=base_model_paths,
+        lora_modules=None,
+        prompt_adapters=None,
+    )
     openai_serving_chat = OpenAIServingChat(
         engine,
         model_config,
-        base_model_paths,
+        openai_serving_models,
         args.response_role,
-        lora_modules=None,
-        prompt_adapters=None,
         request_logger=request_logger,
         chat_template=None,
         chat_template_content_format="auto",
@@ -228,7 +233,7 @@ async def main(args):
     openai_serving_embedding = OpenAIServingEmbedding(
         engine,
         model_config,
-        base_model_paths,
+        openai_serving_models,
         request_logger=request_logger,
         chat_template=None,
         chat_template_content_format="auto",
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index d085333563d19..9ba5eeb7709c9 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -21,10 +21,8 @@ from vllm.entrypoints.openai.protocol import (
     ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage,
     DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo,
     RequestResponseMetadata, ToolCall, UsageInfo)
-from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
-                                                    LoRAModulePath,
-                                                    OpenAIServing,
-                                                    PromptAdapterPath)
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
 from vllm.logger import init_logger
 from vllm.outputs import CompletionOutput, RequestOutput
@@ -42,11 +40,9 @@ class OpenAIServingChat(OpenAIServing):
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        base_model_paths: List[BaseModelPath],
+        models: OpenAIServingModels,
         response_role: str,
         *,
-        lora_modules: Optional[List[LoRAModulePath]],
-        prompt_adapters: Optional[List[PromptAdapterPath]],
         request_logger: Optional[RequestLogger],
         chat_template: Optional[str],
         chat_template_content_format: ChatTemplateContentFormatOption,
@@ -57,9 +53,7 @@ class OpenAIServingChat(OpenAIServing):
     ) -> None:
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
-                         base_model_paths=base_model_paths,
-                         lora_modules=lora_modules,
-                         prompt_adapters=prompt_adapters,
+                         models=models,
                          request_logger=request_logger,
                          return_tokens_as_token_ids=return_tokens_as_token_ids)
 
@@ -126,7 +120,7 @@ class OpenAIServingChat(OpenAIServing):
                 prompt_adapter_request,
             ) = self._maybe_get_adapters(request)
 
-            model_name = self._get_model_name(lora_request)
+            model_name = self.models.model_name(lora_request)
 
             tokenizer = await self.engine_client.get_tokenizer(lora_request)
 
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index aaad7b8c7f44c..17197dce8da23 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -21,10 +21,8 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs,
                                               RequestResponseMetadata,
                                               UsageInfo)
 # yapf: enable
-from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
-                                                    LoRAModulePath,
-                                                    OpenAIServing,
-                                                    PromptAdapterPath)
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams, SamplingParams
@@ -41,18 +39,14 @@ class OpenAIServingCompletion(OpenAIServing):
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        base_model_paths: List[BaseModelPath],
+        models: OpenAIServingModels,
         *,
-        lora_modules: Optional[List[LoRAModulePath]],
-        prompt_adapters: Optional[List[PromptAdapterPath]],
         request_logger: Optional[RequestLogger],
         return_tokens_as_token_ids: bool = False,
     ):
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
-                         base_model_paths=base_model_paths,
-                         lora_modules=lora_modules,
-                         prompt_adapters=prompt_adapters,
+                         models=models,
                          request_logger=request_logger,
                          return_tokens_as_token_ids=return_tokens_as_token_ids)
         diff_sampling_param = self.model_config.get_diff_sampling_param()
@@ -170,7 +164,7 @@ class OpenAIServingCompletion(OpenAIServing):
 
         result_generator = merge_async_iterators(*generators)
 
-        model_name = self._get_model_name(lora_request)
+        model_name = self.models.model_name(lora_request)
         num_prompts = len(engine_prompts)
 
         # Similar to the OpenAI API, when n != best_of, we do not stream the
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index b8fb9d6bd77f2..e7116a3d95d10 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -16,7 +16,8 @@ from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest,
                                               EmbeddingResponse,
                                               EmbeddingResponseData,
                                               ErrorResponse, UsageInfo)
-from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.logger import init_logger
 from vllm.outputs import (EmbeddingOutput, EmbeddingRequestOutput,
                           PoolingRequestOutput)
@@ -46,7 +47,7 @@ class OpenAIServingEmbedding(OpenAIServing):
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        base_model_paths: List[BaseModelPath],
+        models: OpenAIServingModels,
         *,
         request_logger: Optional[RequestLogger],
         chat_template: Optional[str],
@@ -54,9 +55,7 @@ class OpenAIServingEmbedding(OpenAIServing):
     ) -> None:
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
-                         base_model_paths=base_model_paths,
-                         lora_modules=None,
-                         prompt_adapters=None,
+                         models=models,
                          request_logger=request_logger)
 
         self.chat_template = chat_template
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 5b6a089e4c319..319f869240036 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -1,7 +1,5 @@
 import json
-import pathlib
 from concurrent.futures.thread import ThreadPoolExecutor
-from dataclasses import dataclass
 from http import HTTPStatus
 from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping,
                     Optional, Sequence, Tuple, TypedDict, Union)
@@ -28,13 +26,10 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               DetokenizeRequest,
                                               EmbeddingChatRequest,
                                               EmbeddingCompletionRequest,
-                                              ErrorResponse,
-                                              LoadLoraAdapterRequest,
-                                              ModelCard, ModelList,
-                                              ModelPermission, ScoreRequest,
+                                              ErrorResponse, ScoreRequest,
                                               TokenizeChatRequest,
-                                              TokenizeCompletionRequest,
-                                              UnloadLoraAdapterRequest)
+                                              TokenizeCompletionRequest)
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.openai.tool_parsers import ToolParser
 # yapf: enable
 from vllm.inputs import TokensPrompt
@@ -48,30 +43,10 @@ from vllm.sequence import Logprob
 from vllm.tracing import (contains_trace_headers, extract_trace_headers,
                           log_tracing_disabled_warning)
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import AtomicCounter, is_list_of, make_async, random_uuid
+from vllm.utils import is_list_of, make_async, random_uuid
 
 logger = init_logger(__name__)
 
-
-@dataclass
-class BaseModelPath:
-    name: str
-    model_path: str
-
-
-@dataclass
-class PromptAdapterPath:
-    name: str
-    local_path: str
-
-
-@dataclass
-class LoRAModulePath:
-    name: str
-    path: str
-    base_model_name: Optional[str] = None
-
-
 CompletionLikeRequest = Union[CompletionRequest, DetokenizeRequest,
                               EmbeddingCompletionRequest, ScoreRequest,
                               TokenizeCompletionRequest]
@@ -96,10 +71,8 @@ class OpenAIServing:
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        base_model_paths: List[BaseModelPath],
+        models: OpenAIServingModels,
         *,
-        lora_modules: Optional[List[LoRAModulePath]],
-        prompt_adapters: Optional[List[PromptAdapterPath]],
         request_logger: Optional[RequestLogger],
         return_tokens_as_token_ids: bool = False,
     ):
@@ -109,35 +82,7 @@ class OpenAIServing:
         self.model_config = model_config
         self.max_model_len = model_config.max_model_len
 
-        self.base_model_paths = base_model_paths
-
-        self.lora_id_counter = AtomicCounter(0)
-        self.lora_requests = []
-        if lora_modules is not None:
-            self.lora_requests = [
-                LoRARequest(lora_name=lora.name,
-                            lora_int_id=i,
-                            lora_path=lora.path,
-                            base_model_name=lora.base_model_name
-                            if lora.base_model_name
-                            and self._is_model_supported(lora.base_model_name)
-                            else self.base_model_paths[0].name)
-                for i, lora in enumerate(lora_modules, start=1)
-            ]
-
-        self.prompt_adapter_requests = []
-        if prompt_adapters is not None:
-            for i, prompt_adapter in enumerate(prompt_adapters, start=1):
-                with pathlib.Path(prompt_adapter.local_path,
-                                  "adapter_config.json").open() as f:
-                    adapter_config = json.load(f)
-                    num_virtual_tokens = adapter_config["num_virtual_tokens"]
-                self.prompt_adapter_requests.append(
-                    PromptAdapterRequest(
-                        prompt_adapter_name=prompt_adapter.name,
-                        prompt_adapter_id=i,
-                        prompt_adapter_local_path=prompt_adapter.local_path,
-                        prompt_adapter_num_virtual_tokens=num_virtual_tokens))
+        self.models = models
 
         self.request_logger = request_logger
         self.return_tokens_as_token_ids = return_tokens_as_token_ids
@@ -150,33 +95,6 @@ class OpenAIServing:
             self._tokenize_prompt_input_or_inputs,
             executor=self._tokenizer_executor)
 
-    async def show_available_models(self) -> ModelList:
-        """Show available models. Right now we only have one model."""
-        model_cards = [
-            ModelCard(id=base_model.name,
-                      max_model_len=self.max_model_len,
-                      root=base_model.model_path,
-                      permission=[ModelPermission()])
-            for base_model in self.base_model_paths
-        ]
-        lora_cards = [
-            ModelCard(id=lora.lora_name,
-                      root=lora.local_path,
-                      parent=lora.base_model_name if lora.base_model_name else
-                      self.base_model_paths[0].name,
-                      permission=[ModelPermission()])
-            for lora in self.lora_requests
-        ]
-        prompt_adapter_cards = [
-            ModelCard(id=prompt_adapter.prompt_adapter_name,
-                      root=self.base_model_paths[0].name,
-                      permission=[ModelPermission()])
-            for prompt_adapter in self.prompt_adapter_requests
-        ]
-        model_cards.extend(lora_cards)
-        model_cards.extend(prompt_adapter_cards)
-        return ModelList(data=model_cards)
-
     def create_error_response(
             self,
             message: str,
@@ -205,11 +123,13 @@ class OpenAIServing:
     ) -> Optional[ErrorResponse]:
         if self._is_model_supported(request.model):
             return None
-        if request.model in [lora.lora_name for lora in self.lora_requests]:
+        if request.model in [
+                lora.lora_name for lora in self.models.lora_requests
+        ]:
             return None
         if request.model in [
                 prompt_adapter.prompt_adapter_name
-                for prompt_adapter in self.prompt_adapter_requests
+                for prompt_adapter in self.models.prompt_adapter_requests
         ]:
             return None
         return self.create_error_response(
@@ -223,10 +143,10 @@ class OpenAIServing:
             None, PromptAdapterRequest]]:
         if self._is_model_supported(request.model):
             return None, None
-        for lora in self.lora_requests:
+        for lora in self.models.lora_requests:
             if request.model == lora.lora_name:
                 return lora, None
-        for prompt_adapter in self.prompt_adapter_requests:
+        for prompt_adapter in self.models.prompt_adapter_requests:
             if request.model == prompt_adapter.prompt_adapter_name:
                 return None, prompt_adapter
         # if _check_model has been called earlier, this will be unreachable
@@ -588,91 +508,5 @@ class OpenAIServing:
             return logprob.decoded_token
         return tokenizer.decode(token_id)
 
-    async def _check_load_lora_adapter_request(
-            self, request: LoadLoraAdapterRequest) -> Optional[ErrorResponse]:
-        # Check if both 'lora_name' and 'lora_path' are provided
-        if not request.lora_name or not request.lora_path:
-            return self.create_error_response(
-                message="Both 'lora_name' and 'lora_path' must be provided.",
-                err_type="InvalidUserInput",
-                status_code=HTTPStatus.BAD_REQUEST)
-
-        # Check if the lora adapter with the given name already exists
-        if any(lora_request.lora_name == request.lora_name
-               for lora_request in self.lora_requests):
-            return self.create_error_response(
-                message=
-                f"The lora adapter '{request.lora_name}' has already been"
-                "loaded.",
-                err_type="InvalidUserInput",
-                status_code=HTTPStatus.BAD_REQUEST)
-
-        return None
-
-    async def _check_unload_lora_adapter_request(
-            self,
-            request: UnloadLoraAdapterRequest) -> Optional[ErrorResponse]:
-        # Check if either 'lora_name' or 'lora_int_id' is provided
-        if not request.lora_name and not request.lora_int_id:
-            return self.create_error_response(
-                message=
-                "either 'lora_name' and 'lora_int_id' needs to be provided.",
-                err_type="InvalidUserInput",
-                status_code=HTTPStatus.BAD_REQUEST)
-
-        # Check if the lora adapter with the given name exists
-        if not any(lora_request.lora_name == request.lora_name
-                   for lora_request in self.lora_requests):
-            return self.create_error_response(
-                message=
-                f"The lora adapter '{request.lora_name}' cannot be found.",
-                err_type="InvalidUserInput",
-                status_code=HTTPStatus.BAD_REQUEST)
-
-        return None
-
-    async def load_lora_adapter(
-            self,
-            request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]:
-        error_check_ret = await self._check_load_lora_adapter_request(request)
-        if error_check_ret is not None:
-            return error_check_ret
-
-        lora_name, lora_path = request.lora_name, request.lora_path
-        unique_id = self.lora_id_counter.inc(1)
-        self.lora_requests.append(
-            LoRARequest(lora_name=lora_name,
-                        lora_int_id=unique_id,
-                        lora_path=lora_path))
-        return f"Success: LoRA adapter '{lora_name}' added successfully."
-
-    async def unload_lora_adapter(
-            self,
-            request: UnloadLoraAdapterRequest) -> Union[ErrorResponse, str]:
-        error_check_ret = await self._check_unload_lora_adapter_request(request
-                                                                        )
-        if error_check_ret is not None:
-            return error_check_ret
-
-        lora_name = request.lora_name
-        self.lora_requests = [
-            lora_request for lora_request in self.lora_requests
-            if lora_request.lora_name != lora_name
-        ]
-        return f"Success: LoRA adapter '{lora_name}' removed successfully."
-
     def _is_model_supported(self, model_name):
-        return any(model.name == model_name for model in self.base_model_paths)
-
-    def _get_model_name(self, lora: Optional[LoRARequest]):
-        """
-        Returns the appropriate model name depending on the availability
-        and support of the LoRA or base model.
-        Parameters:
-        - lora: LoRARequest that contain a base_model_name.
-        Returns:
-        - str: The name of the base model or the first available model path.
-        """
-        if lora is not None:
-            return lora.lora_name
-        return self.base_model_paths[0].name
+        return self.models.is_base_model(model_name)
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
new file mode 100644
index 0000000000000..26966896bc272
--- /dev/null
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -0,0 +1,210 @@
+import json
+import pathlib
+from dataclasses import dataclass
+from http import HTTPStatus
+from typing import List, Optional, Union
+
+from vllm.config import ModelConfig
+from vllm.entrypoints.openai.protocol import (ErrorResponse,
+                                              LoadLoraAdapterRequest,
+                                              ModelCard, ModelList,
+                                              ModelPermission,
+                                              UnloadLoraAdapterRequest)
+from vllm.lora.request import LoRARequest
+from vllm.prompt_adapter.request import PromptAdapterRequest
+from vllm.utils import AtomicCounter
+
+
+@dataclass
+class BaseModelPath:
+    name: str
+    model_path: str
+
+
+@dataclass
+class PromptAdapterPath:
+    name: str
+    local_path: str
+
+
+@dataclass
+class LoRAModulePath:
+    name: str
+    path: str
+    base_model_name: Optional[str] = None
+
+
+class OpenAIServingModels:
+    """Shared instance to hold data about the loaded base model(s) and adapters.
+
+    Handles the routes:
+    - /v1/models
+    - /v1/load_lora_adapter
+    - /v1/unload_lora_adapter
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        base_model_paths: List[BaseModelPath],
+        *,
+        lora_modules: Optional[List[LoRAModulePath]] = None,
+        prompt_adapters: Optional[List[PromptAdapterPath]] = None,
+    ):
+        super().__init__()
+
+        self.base_model_paths = base_model_paths
+        self.max_model_len = model_config.max_model_len
+
+        self.lora_id_counter = AtomicCounter(0)
+        self.lora_requests = []
+        if lora_modules is not None:
+            self.lora_requests = [
+                LoRARequest(lora_name=lora.name,
+                            lora_int_id=i,
+                            lora_path=lora.path,
+                            base_model_name=lora.base_model_name
+                            if lora.base_model_name
+                            and self.is_base_model(lora.base_model_name) else
+                            self.base_model_paths[0].name)
+                for i, lora in enumerate(lora_modules, start=1)
+            ]
+
+        self.prompt_adapter_requests = []
+        if prompt_adapters is not None:
+            for i, prompt_adapter in enumerate(prompt_adapters, start=1):
+                with pathlib.Path(prompt_adapter.local_path,
+                                  "adapter_config.json").open() as f:
+                    adapter_config = json.load(f)
+                    num_virtual_tokens = adapter_config["num_virtual_tokens"]
+                self.prompt_adapter_requests.append(
+                    PromptAdapterRequest(
+                        prompt_adapter_name=prompt_adapter.name,
+                        prompt_adapter_id=i,
+                        prompt_adapter_local_path=prompt_adapter.local_path,
+                        prompt_adapter_num_virtual_tokens=num_virtual_tokens))
+
+    def is_base_model(self, model_name):
+        return any(model.name == model_name for model in self.base_model_paths)
+
+    def model_name(self, lora_request: Optional[LoRARequest] = None) -> str:
+        """Returns the appropriate model name depending on the availability
+        and support of the LoRA or base model.
+        Parameters:
+        - lora: LoRARequest that contain a base_model_name.
+        Returns:
+        - str: The name of the base model or the first available model path.
+        """
+        if lora_request is not None:
+            return lora_request.lora_name
+        return self.base_model_paths[0].name
+
+    async def show_available_models(self) -> ModelList:
+        """Show available models. This includes the base model and all 
+        adapters"""
+        model_cards = [
+            ModelCard(id=base_model.name,
+                      max_model_len=self.max_model_len,
+                      root=base_model.model_path,
+                      permission=[ModelPermission()])
+            for base_model in self.base_model_paths
+        ]
+        lora_cards = [
+            ModelCard(id=lora.lora_name,
+                      root=lora.local_path,
+                      parent=lora.base_model_name if lora.base_model_name else
+                      self.base_model_paths[0].name,
+                      permission=[ModelPermission()])
+            for lora in self.lora_requests
+        ]
+        prompt_adapter_cards = [
+            ModelCard(id=prompt_adapter.prompt_adapter_name,
+                      root=self.base_model_paths[0].name,
+                      permission=[ModelPermission()])
+            for prompt_adapter in self.prompt_adapter_requests
+        ]
+        model_cards.extend(lora_cards)
+        model_cards.extend(prompt_adapter_cards)
+        return ModelList(data=model_cards)
+
+    async def load_lora_adapter(
+            self,
+            request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]:
+        error_check_ret = await self._check_load_lora_adapter_request(request)
+        if error_check_ret is not None:
+            return error_check_ret
+
+        lora_name, lora_path = request.lora_name, request.lora_path
+        unique_id = self.lora_id_counter.inc(1)
+        self.lora_requests.append(
+            LoRARequest(lora_name=lora_name,
+                        lora_int_id=unique_id,
+                        lora_path=lora_path))
+        return f"Success: LoRA adapter '{lora_name}' added successfully."
+
+    async def unload_lora_adapter(
+            self,
+            request: UnloadLoraAdapterRequest) -> Union[ErrorResponse, str]:
+        error_check_ret = await self._check_unload_lora_adapter_request(request
+                                                                        )
+        if error_check_ret is not None:
+            return error_check_ret
+
+        lora_name = request.lora_name
+        self.lora_requests = [
+            lora_request for lora_request in self.lora_requests
+            if lora_request.lora_name != lora_name
+        ]
+        return f"Success: LoRA adapter '{lora_name}' removed successfully."
+
+    async def _check_load_lora_adapter_request(
+            self, request: LoadLoraAdapterRequest) -> Optional[ErrorResponse]:
+        # Check if both 'lora_name' and 'lora_path' are provided
+        if not request.lora_name or not request.lora_path:
+            return create_error_response(
+                message="Both 'lora_name' and 'lora_path' must be provided.",
+                err_type="InvalidUserInput",
+                status_code=HTTPStatus.BAD_REQUEST)
+
+        # Check if the lora adapter with the given name already exists
+        if any(lora_request.lora_name == request.lora_name
+               for lora_request in self.lora_requests):
+            return create_error_response(
+                message=
+                f"The lora adapter '{request.lora_name}' has already been"
+                "loaded.",
+                err_type="InvalidUserInput",
+                status_code=HTTPStatus.BAD_REQUEST)
+
+        return None
+
+    async def _check_unload_lora_adapter_request(
+            self,
+            request: UnloadLoraAdapterRequest) -> Optional[ErrorResponse]:
+        # Check if either 'lora_name' or 'lora_int_id' is provided
+        if not request.lora_name and not request.lora_int_id:
+            return create_error_response(
+                message=
+                "either 'lora_name' and 'lora_int_id' needs to be provided.",
+                err_type="InvalidUserInput",
+                status_code=HTTPStatus.BAD_REQUEST)
+
+        # Check if the lora adapter with the given name exists
+        if not any(lora_request.lora_name == request.lora_name
+                   for lora_request in self.lora_requests):
+            return create_error_response(
+                message=
+                f"The lora adapter '{request.lora_name}' cannot be found.",
+                err_type="InvalidUserInput",
+                status_code=HTTPStatus.BAD_REQUEST)
+
+        return None
+
+
+def create_error_response(
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse:
+    return ErrorResponse(message=message,
+                         type=err_type,
+                         code=status_code.value)
diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py
index 01852f0df1eca..5830322071e58 100644
--- a/vllm/entrypoints/openai/serving_pooling.py
+++ b/vllm/entrypoints/openai/serving_pooling.py
@@ -15,7 +15,8 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
                                               PoolingChatRequest,
                                               PoolingRequest, PoolingResponse,
                                               PoolingResponseData, UsageInfo)
-from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.logger import init_logger
 from vllm.outputs import PoolingOutput, PoolingRequestOutput
 from vllm.utils import merge_async_iterators
@@ -44,7 +45,7 @@ class OpenAIServingPooling(OpenAIServing):
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        base_model_paths: List[BaseModelPath],
+        models: OpenAIServingModels,
         *,
         request_logger: Optional[RequestLogger],
         chat_template: Optional[str],
@@ -52,9 +53,7 @@ class OpenAIServingPooling(OpenAIServing):
     ) -> None:
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
-                         base_model_paths=base_model_paths,
-                         lora_modules=None,
-                         prompt_adapters=None,
+                         models=models,
                          request_logger=request_logger)
 
         self.chat_template = chat_template
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index a8a126e697641..5d3e7139d7a17 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -10,7 +10,8 @@ from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (ErrorResponse, ScoreRequest,
                                               ScoreResponse, ScoreResponseData,
                                               UsageInfo)
-from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.inputs.data import TokensPrompt
 from vllm.logger import init_logger
 from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
@@ -50,15 +51,13 @@ class OpenAIServingScores(OpenAIServing):
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        base_model_paths: List[BaseModelPath],
+        models: OpenAIServingModels,
         *,
         request_logger: Optional[RequestLogger],
     ) -> None:
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
-                         base_model_paths=base_model_paths,
-                         lora_modules=None,
-                         prompt_adapters=None,
+                         models=models,
                          request_logger=request_logger)
 
     async def create_score(
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py
index 2e849333680d4..b67ecfb01316f 100644
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -15,9 +15,8 @@ from vllm.entrypoints.openai.protocol import (DetokenizeRequest,
                                               TokenizeRequest,
                                               TokenizeResponse)
 # yapf: enable
-from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
-                                                    LoRAModulePath,
-                                                    OpenAIServing)
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -29,18 +28,15 @@ class OpenAIServingTokenization(OpenAIServing):
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        base_model_paths: List[BaseModelPath],
+        models: OpenAIServingModels,
         *,
-        lora_modules: Optional[List[LoRAModulePath]],
         request_logger: Optional[RequestLogger],
         chat_template: Optional[str],
         chat_template_content_format: ChatTemplateContentFormatOption,
     ) -> None:
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
-                         base_model_paths=base_model_paths,
-                         lora_modules=lora_modules,
-                         prompt_adapters=None,
+                         models=models,
                          request_logger=request_logger)
 
         self.chat_template = chat_template

From 365801feddaf5c4448704a1f55269dd992f5a4b1 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 1 Jan 2025 14:15:21 +0800
Subject: [PATCH 45/48] [VLM] Add max-count checking in data parser for single
 image models (#11661)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
---
 docs/source/models/supported_models.md  |  2 +-
 tests/multimodal/test_processing.py     |  3 ++-
 vllm/model_executor/models/blip2.py     |  4 ++++
 vllm/model_executor/models/chameleon.py |  4 ++++
 vllm/model_executor/models/fuyu.py      | 18 +++++++++-------
 vllm/multimodal/parse.py                | 28 +++++++++++++++++++++++--
 6 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index f74c201bdff6b..7682ed104b8c5 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -566,7 +566,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - [V1](gh-issue:8779)
 * - `AriaForConditionalGeneration`
   - Aria
-  - T + I
+  - T + I<sup>+</sup>
   - `rhymes-ai/Aria`
   -
   - ✅︎
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 81278cde264ff..1850ca46ccc8f 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -622,10 +622,11 @@ def _test_processing_cache_correctness(
 
 
 # yapf: disable
+# True if the model supports multiple data items of the modality per request
 @pytest.mark.parametrize(("model_id", "modalities"), [
     ("rhymes-ai/Aria", {"image": True}),
     ("Salesforce/blip2-opt-2.7b", {"image": False}),
-    ("facebook/chameleon-7b", {"image": True}),
+    ("facebook/chameleon-7b", {"image": False}),
     ("adept/fuyu-8b", {"image": False}),
     ("llava-hf/llava-1.5-7b-hf", {"image": True}),
     ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index bf70f5d904f5b..50680fadc4aa3 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -18,6 +18,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
+from vllm.multimodal.parse import MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         MultiModalDataItems, ProcessorInputs,
                                         PromptReplacement)
@@ -404,6 +405,9 @@ def get_max_blip2_image_tokens(ctx: InputContext):
 
 class Blip2MultiModalProcessor(BaseMultiModalProcessor):
 
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return MultiModalDataParser(max_mm_counts={"image": 1})
+
     def _get_hf_processor(self) -> Blip2Processor:
         return self.ctx.get_hf_processor(Blip2Processor)
 
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 85fca23b05746..c731934e792fc 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -31,6 +31,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
+from vllm.multimodal.parse import MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         MultiModalDataItems, ProcessorInputs,
                                         PromptReplacement)
@@ -60,6 +61,9 @@ def get_max_chameleon_image_tokens(ctx: InputContext):
 
 class ChameleonMultiModalProcessor(BaseMultiModalProcessor):
 
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return MultiModalDataParser(max_mm_counts={"image": 1})
+
     def _get_hf_processor(self) -> ChameleonProcessor:
         return self.ctx.get_hf_processor(ChameleonProcessor)
 
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 8c14866f20b92..0a48fa3fe11c0 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -34,7 +34,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
-from vllm.multimodal.parse import ImageProcessorItems
+from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         MultiModalDataItems, ProcessorInputs,
                                         PromptReplacement)
@@ -54,7 +54,7 @@ MAX_IMAGE_FEATURE_SIZE_WIDTH = 1920
 
 class FuyuImagePatchInputs(TypedDict):
     type: Literal["image_patches"]
-    data: torch.Tensor
+    flat_data: torch.Tensor
     """
     Shape: 
     `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)`
@@ -63,7 +63,7 @@ class FuyuImagePatchInputs(TypedDict):
     patches_per_image: List[int]
     """
     List of number of total patches for each image in the batch.
-    This is used to restore the first two dimensions of `data`.
+    This is used to restore the first two dimensions of `flat_data`.
     """
 
 
@@ -102,6 +102,9 @@ def get_max_fuyu_image_tokens(ctx: InputContext):
 
 class FuyuMultiModalProcessor(BaseMultiModalProcessor):
 
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return MultiModalDataParser(max_mm_counts={"image": 1})
+
     def _get_hf_processor(self) -> FuyuProcessor:
         return self.ctx.get_hf_processor(FuyuProcessor)
 
@@ -304,7 +307,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
             return FuyuImagePatchInputs(
                 type="image_patches",
-                data=self._validate_pixel_values(
+                flat_data=self._validate_pixel_values(
                     flatten_bn(image_patches_flat, concat=True)),
                 patches_per_image=[x.size(0) for x in image_patches_flat],
             )
@@ -313,12 +316,13 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _process_image_input(
             self, image_input: FuyuImagePatchInputs) -> NestedTensors:
-        image_patches = image_input["data"]
+        image_patches_flat = image_input["flat_data"]
         patches_per_image = image_input["patches_per_image"]
 
         assert self.vision_embed_tokens is not None
-        vision_embeddings, _ = self.vision_embed_tokens(image_patches)
-        return vision_embeddings.split(patches_per_image, dim=0)
+        vision_embeddings_flat, _ = self.vision_embed_tokens(
+            image_patches_flat)
+        return vision_embeddings_flat.split(patches_per_image, dim=0)
 
     def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self._parse_and_validate_image_input(**kwargs)
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 17a795247372e..da111e999ebb8 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -220,11 +220,24 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
 class MultiModalDataParser:
     """
     Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`.
+
+    Args:
+        max_mm_counts (Mapping[str, int]): The maximum allowed number of items
+            belonging to each modality. This effectively sets a hard limit over
+            `--limit-mm-per-prompt`.
+        target_sr (float, optional): Enables automatic resampling of audio
+            items to the model's expected sampling rate.
     """
 
-    def __init__(self, *, target_sr: Optional[float] = None) -> None:
+    def __init__(
+        self,
+        *,
+        max_mm_counts: Mapping[str, int] = {},
+        target_sr: Optional[float] = None,
+    ) -> None:
         super().__init__()
 
+        self.max_mm_counts = max_mm_counts
         self.target_sr = target_sr
 
     def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]:
@@ -332,6 +345,7 @@ class MultiModalDataParser:
 
     def parse_mm_data(self,
                       mm_data: MultiModalDataDict) -> MultiModalDataItems:
+        max_mm_counts = self.max_mm_counts
         subparsers = self._get_subparsers()
 
         mm_items = MultiModalDataItems()
@@ -339,6 +353,16 @@ class MultiModalDataParser:
             if k not in subparsers:
                 raise ValueError(f"Unsupported modality: {k}")
 
-            mm_items[k] = subparsers[k](v)
+            modality_items = subparsers[k](v)
+
+            if k in max_mm_counts:
+                max_count = max_mm_counts[k]
+                if len(modality_items) > max_count:
+                    raise ValueError(
+                        f"This model supports at most {max_count} {k} items "
+                        f"per prompt, but {len(modality_items)} {k} items "
+                        "were given or set as its limit_mm_per_prompt.")
+
+            mm_items[k] = modality_items
 
         return mm_items

From 11d8a091c6c775575a53d37408c94faa0b07730f Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Wed, 1 Jan 2025 14:42:23 +0800
Subject: [PATCH 46/48] [Misc] Optimize Qwen2-VL LoRA test (#11663)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/lora/test_qwen2vl.py             |  5 ++---
 vllm/model_executor/models/qwen2_vl.py | 20 +++++++++++++++++++-
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py
index c9f48402b0268..ebdd129db5f6a 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -7,7 +7,7 @@ from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
 from vllm.platforms import current_platform
 
-MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct"
+MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
 
 PROMPT_TEMPLATE = (
     "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
@@ -49,10 +49,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     # Print the outputs.
     generated_texts: List[str] = []
     for output in outputs:
-        prompt = output.prompt
         generated_text = output.outputs[0].text.strip()
         generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        print(f"Generated text: {generated_text!r}")
     return generated_texts
 
 
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 1e485f87bb7a4..0df101b3dcce4 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -52,6 +52,7 @@ from vllm.model_executor.layers.quantization.gptq_marlin import (
     GPTQMarlinConfig)
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (ImageItem, ModalityData,
                                     MultiModalFieldConfig, MultiModalKwargs,
@@ -926,15 +927,23 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
     }
 
     # LoRA specific attributes
-    # TODO Support LoRA for the visual encoder in the future.
     supported_lora_modules = [
         "qkv_proj",
         "o_proj",
         "gate_up_proj",
         "down_proj",
+        # vision tower
+        "qkv",
+        "attn.proj",  # Distinguish patch_embed.proj
+        "fc1",
+        "fc2",
+        # projector
+        "mlp.0",
+        "mlp.2"
     ]
     embedding_modules = {}
     embedding_padding_modules = []
+
     # To ensure correct weight loading and mapping.
     hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
         "lm_head.": "language_model.lm_head.",
@@ -1231,3 +1240,12 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="visual.",
+            tower_model="visual.merger.")

From f962f426bc63b66301da61d2ac7078bf0ba941b0 Mon Sep 17 00:00:00 2001
From: Lu Fang <30275821+houseroad@users.noreply.github.com>
Date: Tue, 31 Dec 2024 23:39:30 -0800
Subject: [PATCH 47/48] [Misc] Replace space with - in the file names (#11667)

Signed-off-by: Lu Fang <lufang@fb.com>
---
 .github/ISSUE_TEMPLATE/{400-bug report.yml => 400-bug-report.yml} | 0
 .../{500-feature request.yml => 500-feature-request.yml}          | 0
 .github/ISSUE_TEMPLATE/{600-new model.yml => 600-new-model.yml}   | 0
 ...-performance discussion.yml => 700-performance-discussion.yml} | 0
 .../{800-misc discussion.yml => 800-misc-discussion.yml}          | 0
 5 files changed, 0 insertions(+), 0 deletions(-)
 rename .github/ISSUE_TEMPLATE/{400-bug report.yml => 400-bug-report.yml} (100%)
 rename .github/ISSUE_TEMPLATE/{500-feature request.yml => 500-feature-request.yml} (100%)
 rename .github/ISSUE_TEMPLATE/{600-new model.yml => 600-new-model.yml} (100%)
 rename .github/ISSUE_TEMPLATE/{700-performance discussion.yml => 700-performance-discussion.yml} (100%)
 rename .github/ISSUE_TEMPLATE/{800-misc discussion.yml => 800-misc-discussion.yml} (100%)

diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml
similarity index 100%
rename from .github/ISSUE_TEMPLATE/400-bug report.yml
rename to .github/ISSUE_TEMPLATE/400-bug-report.yml
diff --git a/.github/ISSUE_TEMPLATE/500-feature request.yml b/.github/ISSUE_TEMPLATE/500-feature-request.yml
similarity index 100%
rename from .github/ISSUE_TEMPLATE/500-feature request.yml
rename to .github/ISSUE_TEMPLATE/500-feature-request.yml
diff --git a/.github/ISSUE_TEMPLATE/600-new model.yml b/.github/ISSUE_TEMPLATE/600-new-model.yml
similarity index 100%
rename from .github/ISSUE_TEMPLATE/600-new model.yml
rename to .github/ISSUE_TEMPLATE/600-new-model.yml
diff --git a/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml
similarity index 100%
rename from .github/ISSUE_TEMPLATE/700-performance discussion.yml
rename to .github/ISSUE_TEMPLATE/700-performance-discussion.yml
diff --git a/.github/ISSUE_TEMPLATE/800-misc discussion.yml b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
similarity index 100%
rename from .github/ISSUE_TEMPLATE/800-misc discussion.yml
rename to .github/ISSUE_TEMPLATE/800-misc-discussion.yml

From 6d70198b17b008f5b845582590b96a507b4d68b5 Mon Sep 17 00:00:00 2001
From: Kazuhiro Serizawa <nserihiro@gmail.com>
Date: Wed, 1 Jan 2025 17:10:10 +0900
Subject: [PATCH 48/48] [Doc] Fix typo (#11666)

Signed-off-by: Kazuhiro Serizawa <nserihiro@gmail.com>
---
 vllm/model_executor/layers/rejection_sampler.py | 2 +-
 vllm/v1/sample/ops/topk_topp_sampler.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index 97a1b0c9603bd..165e8309fee64 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -39,7 +39,7 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler):
             strict_mode: Whether or not to perform shape/device/dtype checks
             during sampling. This catches correctness issues but adds
             nontrivial latency.
-            use_falshinfer: We will use this parameter to determine whether
+            use_flashinfer: We will use this parameter to determine whether
             to use the FlashInfer rejection sampling kernel or not. If it's
             None, we will use the default value from the environment variable.
             This parameter is only used for testing purposes.
diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index c088c3c129ca5..f2007d85c61a5 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -44,7 +44,7 @@ class TopKTopPSampler(nn.Module):
                 logger.warning(
                     "FlashInfer is not available. Falling back to the PyTorch-"
                     "native implementation of top-p & top-k sampling. For the "
-                    "best performance, please install FalshInfer.")
+                    "best performance, please install FlashInfer.")
                 self.forward = self.forward_native
         else:
             self.forward = self.forward_native