From 7311f74468d2ba4f89658aa0fedf3811f8769b30 Mon Sep 17 00:00:00 2001
From: Kebe <mail@kebe7jun.com>
Date: Fri, 25 Jul 2025 18:42:23 +0800
Subject: [PATCH 01/57] [Bugfix] GGUF: fix AttributeError: 'PosixPath' object
 has no attribute 'startswith' (#21579)

Signed-off-by: Kebe <mail@kebe7jun.com>
---
 vllm/transformers_utils/config.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 8d1f59e6eadf1..da475c3b50a39 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -584,7 +584,7 @@ def get_pooling_config_name(pooling_name: str) -> Union[str, None]:
 
 
 @cache
-def get_sentence_transformer_tokenizer_config(model: str,
+def get_sentence_transformer_tokenizer_config(model: Union[str, Path],
                                               revision: Optional[str] = 'main'
                                               ):
     """
@@ -592,7 +592,7 @@ def get_sentence_transformer_tokenizer_config(model: str,
     given Sentence Transformer BERT model.
 
     Parameters:
-    - model (str): The name of the Sentence Transformer
+    - model (str|Path): The name of the Sentence Transformer
     BERT model.
     - revision (str, optional): The revision of the m
     odel to use. Defaults to 'main'.
@@ -620,7 +620,7 @@ def get_sentence_transformer_tokenizer_config(model: str,
             if encoder_dict:
                 break
 
-    if not encoder_dict and not model.startswith("/"):
+    if not encoder_dict and not Path(model).is_absolute():
         try:
             # If model is on HuggingfaceHub, get the repo files
             repo_files = list_repo_files(model,

From 5c3f2628d5ddad55fd350ef733167fd7050e4ac6 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Fri, 25 Jul 2025 18:57:34 +0800
Subject: [PATCH 02/57] [Quantization] Enable BNB support for more MoE models
 (#21370)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/models/dots1.py    | 148 +++++++++++++------------
 vllm/model_executor/models/glm4_moe.py |  25 +++--
 2 files changed, 93 insertions(+), 80 deletions(-)

diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py
index 4bdcbfabbbc26..9b21a79446138 100644
--- a/vllm/model_executor/models/dots1.py
+++ b/vllm/model_executor/models/dots1.py
@@ -54,8 +54,8 @@ from vllm.model_executor.model_loader.weight_utils import (
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsPP
-from .utils import (PPMissingLayer, is_pp_missing_parameter,
+from .interfaces import SupportsLoRA, SupportsPP
+from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
@@ -327,6 +327,7 @@ class Dots1DecoderLayer(nn.Module):
         return hidden_states, residual
 
 
+@support_torch_compile
 class Dots1Model(nn.Module):
 
     fall_back_to_pt_during_load = False
@@ -404,68 +405,12 @@ class Dots1Model(nn.Module):
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
-
-@support_torch_compile
-class Dots1ForCausalLM(nn.Module, SupportsPP):
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        config = vllm_config.model_config.hf_config
-        quant_config = vllm_config.quant_config
-        self.config = config
-        self.quant_config = quant_config
-        self.model = Dots1Model(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
-        if get_pp_group().is_last_rank:
-            self.lm_head = ParallelLMHead(config.vocab_size,
-                                          config.hidden_size,
-                                          quant_config=quant_config)
-        else:
-            self.lm_head = PPMissingLayer()
-        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(
-            input_ids,
-            positions,
-            intermediate_tensors,
-            inputs_embeds,
-        )
-        return hidden_states
-
-    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
-        return logits
-
-    def make_empty_intermediate_tensors(
-            self, batch_size: int, dtype: torch.dtype,
-            device: torch.device) -> IntermediateTensors:
-        return IntermediateTensors({
-            "hidden_states":
-            torch.zeros((batch_size, self.config.hidden_size),
-                        dtype=dtype,
-                        device=device),
-            "residual":
-            torch.zeros((batch_size, self.config.hidden_size),
-                        dtype=dtype,
-                        device=device),
-        })
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        return FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts)
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
@@ -477,14 +422,9 @@ class Dots1ForCausalLM(nn.Module, SupportsPP):
             ("gate_up_proj", "up_proj", 1),
         ]
 
-        expert_params_mapping = FusedMoE.make_expert_params_mapping(
-            ckpt_gate_proj_name="gate_proj",
-            ckpt_down_proj_name="down_proj",
-            ckpt_up_proj_name="up_proj",
-            num_experts=self.config.n_routed_experts)
-
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
+        expert_params_mapping = self.get_expert_mapping()
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
@@ -534,3 +474,71 @@ class Dots1ForCausalLM(nn.Module, SupportsPP):
                     weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
+
+
+class Dots1ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Dots1Model(vllm_config=vllm_config,
+                                prefix=maybe_prefix(prefix, "model"))
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(config.vocab_size,
+                                          config.hidden_size,
+                                          quant_config=quant_config)
+        else:
+            self.lm_head = PPMissingLayer()
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            intermediate_tensors,
+            inputs_embeds,
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        return self.model.get_expert_mapping()
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index 43824abb571a6..6a196fef572de 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -53,7 +53,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -461,6 +461,15 @@ class Glm4MoeModel(nn.Module):
                         device=device),
         })
 
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        return FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts)
+
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
@@ -472,16 +481,9 @@ class Glm4MoeModel(nn.Module):
             ("gate_up_proj", "up_proj", 1),
         ]
 
-        # Params for weights, fp8 weight scales, fp8 activation scales
-        # (param_name, weight_name, expert_id, shard_id)
-        expert_params_mapping = FusedMoE.make_expert_params_mapping(
-            ckpt_gate_proj_name="gate_proj",
-            ckpt_down_proj_name="down_proj",
-            ckpt_up_proj_name="up_proj",
-            num_experts=self.config.n_routed_experts)
-
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
+        expert_params_mapping = self.get_expert_mapping()
         for name, loaded_weight in weights:
             spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
             if spec_layer is not None:
@@ -570,7 +572,7 @@ class Glm4MoeModel(nn.Module):
         return loaded_params
 
 
-class Glm4MoeForCausalLM(nn.Module, SupportsPP):
+class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -677,6 +679,9 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP):
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
 
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        return self.model.get_expert_mapping()
+
 
 def get_spec_layer_idx_from_weight_name(config: PretrainedConfig,
                                         weight_name: str) -> Optional[int]:

From 46d81d69511ac11f27d19af80d19dd7b2cce8613 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 25 Jul 2025 20:36:45 +0800
Subject: [PATCH 03/57] [V1] Get supported tasks from model runner instead of
 model config (#21585)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/entrypoints/llm.py                  | 24 +++++++++++-----
 vllm/entrypoints/openai/api_server.py    | 32 +++++++++++++---------
 vllm/entrypoints/openai/run_batch.py     | 21 +++++++++-----
 vllm/executor/executor_base.py           |  8 +++---
 vllm/model_executor/layers/pooler.py     |  3 +-
 vllm/model_executor/models/bert.py       |  2 +-
 vllm/model_executor/models/gritlm.py     |  2 +-
 vllm/model_executor/models/modernbert.py |  2 +-
 vllm/pooling_params.py                   |  5 ++--
 vllm/tasks.py                            | 11 ++++++++
 vllm/v1/engine/async_llm.py              |  4 +++
 vllm/v1/engine/core.py                   | 11 ++++++--
 vllm/v1/engine/core_client.py            | 16 +++++++++++
 vllm/v1/engine/llm_engine.py             |  4 +++
 vllm/v1/worker/gpu_model_runner.py       | 35 +++++++++++++++++++++---
 vllm/v1/worker/gpu_worker.py             |  6 ++--
 vllm/v1/worker/tpu_model_runner.py       | 31 +++++++++++++++++++--
 vllm/v1/worker/tpu_worker.py             |  6 ++--
 vllm/worker/model_runner_base.py         | 31 +++++++++++++++++++--
 19 files changed, 200 insertions(+), 54 deletions(-)
 create mode 100644 vllm/tasks.py

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 2f766a2dae57a..2c961156bc845 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -14,6 +14,7 @@ from pydantic import ValidationError
 from tqdm.auto import tqdm
 from typing_extensions import TypeVar, deprecated
 
+import vllm.envs as envs
 from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
                               BeamSearchSequence,
                               create_sort_beams_key_function)
@@ -44,9 +45,10 @@ from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput,
                           PoolingRequestOutput, RequestOutput,
                           ScoringRequestOutput)
-from vllm.pooling_params import PoolingParams, PoolingTask
+from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
                                   RequestOutputKind, SamplingParams)
+from vllm.tasks import PoolingTask
 from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
                                                get_cached_tokenizer)
 from vllm.usage.usage_lib import UsageContext
@@ -277,6 +279,16 @@ class LLM:
         self.request_counter = Counter()
         self.default_sampling_params: Union[dict[str, Any], None] = None
 
+        if envs.VLLM_USE_V1:
+            supported_tasks = self.llm_engine \
+                .get_supported_tasks()  # type: ignore
+        else:
+            supported_tasks = self.llm_engine.model_config.supported_tasks
+
+        logger.info("Supported_tasks: %s", supported_tasks)
+
+        self.supported_tasks = supported_tasks
+
     def get_tokenizer(
         self,
         lora_request: Optional[LoRARequest] = None,
@@ -1170,8 +1182,7 @@ class LLM:
             A list of `EmbeddingRequestOutput` objects containing the
             embedding vectors in the same order as the input prompts.
         """
-        model_config = self.llm_engine.model_config
-        if "embed" not in model_config.supported_tasks:
+        if "embed" not in self.supported_tasks:
             raise ValueError("Embedding API is not supported by this model. "
                              "Please set `--task embed`.")
 
@@ -1215,8 +1226,7 @@ class LLM:
             A list of `ClassificationRequestOutput` objects containing the
             embedding vectors in the same order as the input prompts.
         """
-        model_config = self.llm_engine.model_config
-        if "classify" not in model_config.supported_tasks:
+        if "classify" not in self.supported_tasks:
             raise ValueError(
                 "Classification API is not supported by this model. "
                 "Please set `--task classify`.")
@@ -1397,8 +1407,8 @@ class LLM:
 
             raise ValueError(" ".join(messages))
 
-        if all(t not in model_config.supported_tasks
-               for t in ("embed", "classify")):
+        supported_tasks = self.supported_tasks
+        if all(t not in supported_tasks for t in ("embed", "classify")):
             raise ValueError("Score API is not supported by this model. "
                              "Please set `--task embed` or `--task classify`.")
 
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 8540d25d4e94d..5b87aed06e9ba 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1586,6 +1586,14 @@ async def init_app_state(
     state.vllm_config = vllm_config
     model_config = vllm_config.model_config
 
+    if envs.VLLM_USE_V1:
+        supported_tasks = await engine_client \
+            .get_supported_tasks()  # type: ignore
+    else:
+        supported_tasks = model_config.supported_tasks
+
+    logger.info("Supported_tasks: %s", supported_tasks)
+
     resolved_chat_template = load_chat_template(args.chat_template)
     if resolved_chat_template is not None:
         # Get the tokenizer to check official template
@@ -1647,7 +1655,7 @@ async def init_app_state(
         reasoning_parser=args.reasoning_parser,
         enable_prompt_tokens_details=args.enable_prompt_tokens_details,
         enable_force_include_usage=args.enable_force_include_usage,
-    ) if "generate" in model_config.supported_tasks else None
+    ) if "generate" in supported_tasks else None
     state.openai_serving_chat = OpenAIServingChat(
         engine_client,
         model_config,
@@ -1664,7 +1672,7 @@ async def init_app_state(
         reasoning_parser=args.reasoning_parser,
         enable_prompt_tokens_details=args.enable_prompt_tokens_details,
         enable_force_include_usage=args.enable_force_include_usage,
-    ) if "generate" in model_config.supported_tasks else None
+    ) if "generate" in supported_tasks else None
     state.openai_serving_completion = OpenAIServingCompletion(
         engine_client,
         model_config,
@@ -1673,7 +1681,7 @@ async def init_app_state(
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
         enable_prompt_tokens_details=args.enable_prompt_tokens_details,
         enable_force_include_usage=args.enable_force_include_usage,
-    ) if "generate" in model_config.supported_tasks else None
+    ) if "generate" in supported_tasks else None
     state.openai_serving_pooling = OpenAIServingPooling(
         engine_client,
         model_config,
@@ -1681,7 +1689,7 @@ async def init_app_state(
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
-    ) if "encode" in model_config.supported_tasks else None
+    ) if "encode" in supported_tasks else None
     state.openai_serving_embedding = OpenAIServingEmbedding(
         engine_client,
         model_config,
@@ -1689,24 +1697,22 @@ async def init_app_state(
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
-    ) if "embed" in model_config.supported_tasks else None
+    ) if "embed" in supported_tasks else None
     state.openai_serving_classification = ServingClassification(
         engine_client,
         model_config,
         state.openai_serving_models,
         request_logger=request_logger,
-    ) if "classify" in model_config.supported_tasks else None
+    ) if "classify" in supported_tasks else None
 
-    enable_serving_reranking = ("classify" in model_config.supported_tasks
-                                and getattr(model_config.hf_config,
-                                            "num_labels", 0) == 1)
+    enable_serving_reranking = ("classify" in supported_tasks and getattr(
+        model_config.hf_config, "num_labels", 0) == 1)
     state.openai_serving_scores = ServingScores(
         engine_client,
         model_config,
         state.openai_serving_models,
         request_logger=request_logger,
-    ) if ("embed" in model_config.supported_tasks
-          or enable_serving_reranking) else None
+    ) if ("embed" in supported_tasks or enable_serving_reranking) else None
 
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
@@ -1721,13 +1727,13 @@ async def init_app_state(
         model_config,
         state.openai_serving_models,
         request_logger=request_logger,
-    ) if "transcription" in model_config.supported_tasks else None
+    ) if "transcription" in supported_tasks else None
     state.openai_serving_translation = OpenAIServingTranslation(
         engine_client,
         model_config,
         state.openai_serving_models,
         request_logger=request_logger,
-    ) if "transcription" in model_config.supported_tasks else None
+    ) if "transcription" in supported_tasks else None
     state.task = model_config.task
 
     state.enable_server_load_tracking = args.enable_server_load_tracking
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 5770550923270..137b368dad202 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -14,6 +14,7 @@ import torch
 from prometheus_client import start_http_server
 from tqdm import tqdm
 
+import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
 from vllm.engine.protocol import EngineClient
@@ -335,6 +336,14 @@ async def run_batch(
 
     model_config = vllm_config.model_config
 
+    if envs.VLLM_USE_V1:
+        supported_tasks = await engine_client \
+            .get_supported_tasks()  # type: ignore
+    else:
+        supported_tasks = model_config.supported_tasks
+
+    logger.info("Supported_tasks: %s", supported_tasks)
+
     # Create the openai serving objects.
     openai_serving_models = OpenAIServingModels(
         engine_client=engine_client,
@@ -351,7 +360,7 @@ async def run_batch(
         chat_template=None,
         chat_template_content_format="auto",
         enable_prompt_tokens_details=args.enable_prompt_tokens_details,
-    ) if "generate" in model_config.supported_tasks else None
+    ) if "generate" in supported_tasks else None
     openai_serving_embedding = OpenAIServingEmbedding(
         engine_client,
         model_config,
@@ -359,19 +368,17 @@ async def run_batch(
         request_logger=request_logger,
         chat_template=None,
         chat_template_content_format="auto",
-    ) if "embed" in model_config.supported_tasks else None
+    ) if "embed" in supported_tasks else None
 
-    enable_serving_reranking = ("classify" in model_config.supported_tasks
-                                and getattr(model_config.hf_config,
-                                            "num_labels", 0) == 1)
+    enable_serving_reranking = ("classify" in supported_tasks and getattr(
+        model_config.hf_config, "num_labels", 0) == 1)
 
     openai_serving_scores = ServingScores(
         engine_client,
         model_config,
         openai_serving_models,
         request_logger=request_logger,
-    ) if ("embed" in model_config.supported_tasks
-          or enable_serving_reranking) else None
+    ) if ("embed" in supported_tasks or enable_serving_reranking) else None
 
     tracker = BatchProgressTracker()
     logger.info("Reading batch from %s...", args.input_file)
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 483fdb1486f79..97d0d6f08b81e 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -16,8 +16,8 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.pooling_params import PoolingTask
 from vllm.sequence import ExecuteModelRequest, PoolerOutput
+from vllm.tasks import SupportedTask
 from vllm.utils import make_async
 from vllm.worker.worker_base import WorkerBase
 
@@ -136,9 +136,9 @@ class ExecutorBase(ABC):
         return self.collective_rpc(rpc_func)
 
     @cached_property  # Avoid unnecessary RPC calls
-    def supported_pooling_tasks(self) -> tuple[PoolingTask, ...]:
-        output = self.collective_rpc("get_supported_pooling_tasks")
-        return tuple({task for tasks in output for task in tasks})
+    def supported_tasks(self) -> tuple[SupportedTask, ...]:
+        output = self.collective_rpc("get_supported_tasks")
+        return output[0]
 
     def execute_model(
         self, execute_model_req: ExecuteModelRequest
diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py
index c06cca080227e..5bfd4aaccc17c 100644
--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@@ -16,8 +16,9 @@ from vllm.config import ModelConfig, PoolerConfig
 from vllm.model_executor.pooling_metadata import (  # noqa: E501
     PoolingMetadata as V0PoolingMetadata)
 from vllm.model_executor.pooling_metadata import PoolingTensors
-from vllm.pooling_params import PoolingParams, PoolingTask
+from vllm.pooling_params import PoolingParams
 from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput
+from vllm.tasks import PoolingTask
 from vllm.utils import resolve_obj_by_qualname
 from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata
 
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index 9dc6115f850ec..c3066aaa2b87d 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -26,8 +26,8 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.pooling_metadata import PoolingMetadata
-from vllm.pooling_params import PoolingTask
 from vllm.sequence import IntermediateTensors
+from vllm.tasks import PoolingTask
 
 from .interfaces import SupportsCrossEncoding, SupportsQuant, SupportsV0Only
 from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py
index 8a3fbc6a49f04..c99970284a953 100644
--- a/vllm/model_executor/models/gritlm.py
+++ b/vllm/model_executor/models/gritlm.py
@@ -16,8 +16,8 @@ from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler,
                                                get_prompt_token_ids)
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.model_executor.pooling_metadata import PoolingMetadata
-from vllm.pooling_params import PoolingTask
 from vllm.sequence import PoolerOutput
+from vllm.tasks import PoolingTask
 from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
 
 from .interfaces import SupportsV0Only
diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py
index be1c3438d9db1..fc2b0c1f51821 100644
--- a/vllm/model_executor/models/modernbert.py
+++ b/vllm/model_executor/models/modernbert.py
@@ -23,8 +23,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.pooling_metadata import PoolingMetadata
-from vllm.pooling_params import PoolingTask
 from vllm.sequence import IntermediateTensors
+from vllm.tasks import PoolingTask
 
 from .interfaces import SupportsCrossEncoding, SupportsV0Only
 from .utils import WeightsMapper, maybe_prefix
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index 868facbe2557a..23eb775f2dc69 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -1,17 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Literal, Optional
+from typing import TYPE_CHECKING, Optional
 
 import msgspec
 
 from vllm.sampling_params import RequestOutputKind
+from vllm.tasks import PoolingTask
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
 
-PoolingTask = Literal["encode", "embed", "classify", "score"]
-
 
 class PoolingParams(
         msgspec.Struct,
diff --git a/vllm/tasks.py b/vllm/tasks.py
new file mode 100644
index 0000000000000..85c5c6e436205
--- /dev/null
+++ b/vllm/tasks.py
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Literal, get_args
+
+GenerationTask = Literal["generate", "transcription"]
+GENERATION_TASKS = get_args(GenerationTask)
+
+PoolingTask = Literal["encode", "embed", "classify", "score"]
+POOLING_TASKS = get_args(PoolingTask)
+
+SupportedTask = Literal[GenerationTask, PoolingTask]
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 02cb80197fa47..ed0d9620f4762 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -21,6 +21,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.outputs import PoolingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
+from vllm.tasks import SupportedTask
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -211,6 +212,9 @@ class AsyncLLM(EngineClient):
         if handler := getattr(self, "output_handler", None):
             handler.cancel()
 
+    async def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
+        return await self.engine_core.get_supported_tasks_async()
+
     async def add_request(
         self,
         request_id: str,
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 88c511606d7c5..4124ee05326ce 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -23,6 +23,7 @@ from vllm.executor.multiproc_worker_utils import _add_prefix
 from vllm.logger import init_logger
 from vllm.logging_utils.dump_input import dump_engine_exception
 from vllm.lora.request import LoRARequest
+from vllm.tasks import POOLING_TASKS, SupportedTask
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
 from vllm.utils import (bind_process_name, make_zmq_socket,
@@ -195,11 +196,17 @@ class EngineCore:
                      "warmup model) took %.2f seconds"), elapsed)
         return num_gpu_blocks, num_cpu_blocks, scheduler_kv_cache_config
 
+    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
+        return self.model_executor.supported_tasks
+
     def add_request(self, request: EngineCoreRequest):
         """Add request to the scheduler."""
         if pooling_params := request.pooling_params:
-            supported_pooling_tasks = (
-                self.model_executor.supported_pooling_tasks)
+            supported_pooling_tasks = [
+                task for task in self.get_supported_tasks()
+                if task in POOLING_TASKS
+            ]
+
             if pooling_params.task not in supported_pooling_tasks:
                 raise ValueError(f"Unsupported task: {pooling_params.task!r} "
                                  f"Supported tasks: {supported_pooling_tasks}")
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 69ae3690d00e9..b14d85bbf8e9d 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -21,6 +21,7 @@ import zmq.asyncio
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
+from vllm.tasks import SupportedTask
 from vllm.utils import get_open_port, get_open_zmq_inproc_path, make_zmq_socket
 from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
                             EngineCoreRequestType,
@@ -104,6 +105,9 @@ class EngineCoreClient(ABC):
     def get_output(self) -> EngineCoreOutputs:
         raise NotImplementedError
 
+    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
+        raise NotImplementedError
+
     def add_request(self, request: EngineCoreRequest) -> None:
         raise NotImplementedError
 
@@ -170,6 +174,9 @@ class EngineCoreClient(ABC):
     async def get_output_async(self) -> EngineCoreOutputs:
         raise NotImplementedError
 
+    async def get_supported_tasks_async(self) -> tuple[SupportedTask, ...]:
+        raise NotImplementedError
+
     async def add_request_async(self, request: EngineCoreRequest) -> None:
         raise NotImplementedError
 
@@ -238,6 +245,9 @@ class InprocClient(EngineCoreClient):
         outputs, _ = self.engine_core.step()
         return outputs.get(0) or EngineCoreOutputs()
 
+    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
+        return self.engine_core.get_supported_tasks()
+
     def add_request(self, request: EngineCoreRequest) -> None:
         self.engine_core.add_request(request)
 
@@ -608,6 +618,9 @@ class SyncMPClient(MPClient):
 
         return future.result()
 
+    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
+        return self.call_utility("get_supported_tasks")
+
     def add_request(self, request: EngineCoreRequest) -> None:
         if self.is_dp:
             self.engines_running = True
@@ -802,6 +815,9 @@ class AsyncMPClient(MPClient):
         self._ensure_output_queue_task()
         return await future
 
+    async def get_supported_tasks_async(self) -> tuple[SupportedTask, ...]:
+        return await self.call_utility_async("get_supported_tasks")
+
     async def add_request_async(self, request: EngineCoreRequest) -> None:
         request.client_index = self.client_index
         await self._send_input(EngineCoreRequestType.ADD, request)
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 991242e18278d..efbdffbc0900d 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -18,6 +18,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.outputs import PoolingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
+from vllm.tasks import SupportedTask
 from vllm.transformers_utils.tokenizer_group import (
     TokenizerGroup, init_tokenizer_from_configs)
 from vllm.usage.usage_lib import UsageContext
@@ -176,6 +177,9 @@ class LLMEngine:
     def validate_outputs(cls, outputs, output_type):
         return outputs
 
+    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
+        return self.engine_core.get_supported_tasks()
+
     def abort_request(self, request_ids: list[str]) -> None:
         """Remove request_ids from EngineCore and Detokenizer."""
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 32004ced4aae0..5fe594db667a5 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -30,15 +30,17 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaBase
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
-from vllm.model_executor.models.interfaces import is_mixture_of_experts
-from vllm.model_executor.models.interfaces_base import (VllmModelForPooling,
-                                                        is_pooling_model)
+from vllm.model_executor.models.interfaces import (is_mixture_of_experts,
+                                                   supports_transcription)
+from vllm.model_executor.models.interfaces_base import (
+    VllmModelForPooling, is_pooling_model, is_text_generation_model)
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.multimodal.utils import group_mm_inputs_by_modality
-from vllm.pooling_params import PoolingParams, PoolingTask
+from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors, PoolerOutput
+from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                         GiB_bytes, LazyLoader, check_use_alibi, get_dtype_size,
                         is_pin_memory_available, round_up)
@@ -1153,6 +1155,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     def get_model(self) -> nn.Module:
         return self.model
 
+    def get_supported_generation_tasks(self) -> list[GenerationTask]:
+        model = self.get_model()
+        supported_tasks = list[GenerationTask]()
+
+        if is_text_generation_model(model):
+            supported_tasks.append("generate")
+
+        if supports_transcription(model):
+            if model.supports_transcription_only:
+                return ["transcription"]
+
+            supported_tasks.append("transcription")
+
+        return supported_tasks
+
     def get_supported_pooling_tasks(self) -> list[PoolingTask]:
         model = self.get_model()
         if not is_pooling_model(model):
@@ -1160,6 +1177,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         return list(model.pooler.get_supported_tasks())
 
+    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
+        tasks = list[SupportedTask]()
+
+        if self.model_config.runner_type == "generate":
+            tasks.extend(self.get_supported_generation_tasks())
+        if self.model_config.runner_type == "pooling":
+            tasks.extend(self.get_supported_pooling_tasks())
+
+        return tuple(tasks)
+
     def apply_grammar_bitmask(
         self,
         scheduler_output: "SchedulerOutput",
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 522946351148b..dcfb038d28c20 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -23,8 +23,8 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
-from vllm.pooling_params import PoolingTask
 from vllm.sequence import IntermediateTensors
+from vllm.tasks import SupportedTask
 from vllm.utils import GiB_bytes, MemorySnapshot, memory_profiling
 from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
@@ -320,8 +320,8 @@ class Worker(WorkerBase):
     def get_model(self) -> nn.Module:
         return self.model_runner.get_model()
 
-    def get_supported_pooling_tasks(self) -> list[PoolingTask]:
-        return self.model_runner.get_supported_pooling_tasks()
+    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
+        return self.model_runner.get_supported_tasks()
 
     @torch.inference_mode()
     def execute_model(
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index e8c8008458960..59cbb0150570b 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -27,13 +27,15 @@ from vllm.logger import init_logger
 from vllm.lora.layers import BaseLayerWithLoRA
 from vllm.model_executor.model_loader import get_model_loader
 from vllm.model_executor.model_loader.tpu import TPUModelLoader
-from vllm.model_executor.models.interfaces_base import is_pooling_model
+from vllm.model_executor.models.interfaces import supports_transcription
+from vllm.model_executor.models.interfaces_base import (
+    is_pooling_model, is_text_generation_model)
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargs,
                                     PlaceholderRange)
 from vllm.multimodal.utils import group_mm_inputs_by_modality
-from vllm.pooling_params import PoolingTask
 from vllm.sequence import IntermediateTensors
+from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 from vllm.utils import (LayerBlockType, cdiv, is_pin_memory_available,
                         prev_power_of_2)
 from vllm.v1.attention.backends.pallas import (TPU_STR_DTYPE_TO_TORCH_DTYPE,
@@ -489,6 +491,21 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     def get_model(self) -> nn.Module:
         return self.model
 
+    def get_supported_generation_tasks(self) -> list[GenerationTask]:
+        model = self.get_model()
+        supported_tasks = list[GenerationTask]()
+
+        if is_text_generation_model(model):
+            supported_tasks.append("generate")
+
+        if supports_transcription(model):
+            if model.supports_transcription_only:
+                return ["transcription"]
+
+            supported_tasks.append("transcription")
+
+        return supported_tasks
+
     def get_supported_pooling_tasks(self) -> list[PoolingTask]:
         model = self.get_model()
         if not is_pooling_model(model):
@@ -496,6 +513,16 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         return list(model.pooler.get_supported_tasks())
 
+    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
+        tasks = list[SupportedTask]()
+
+        if self.model_config.runner_type == "generate":
+            tasks.extend(self.get_supported_generation_tasks())
+        if self.model_config.runner_type == "pooling":
+            tasks.extend(self.get_supported_pooling_tasks())
+
+        return tuple(tasks)
+
     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
         """
         Generates the KVCacheSpec by parsing the kv cache format from each
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index 254b058d2cd32..72e0e4230a017 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -21,7 +21,7 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
-from vllm.pooling_params import PoolingTask
+from vllm.tasks import SupportedTask
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
 from vllm.v1.attention.backends.pallas import TPU_HEAD_SIZE_ALIGNMENT
 from vllm.v1.core.sched.output import SchedulerOutput
@@ -282,8 +282,8 @@ class TPUWorker:
     def get_model(self) -> nn.Module:
         return self.model_runner.get_model()
 
-    def get_supported_pooling_tasks(self) -> list[PoolingTask]:
-        return self.model_runner.get_supported_pooling_tasks()
+    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
+        return self.model_runner.get_supported_tasks()
 
     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
         return self.model_runner.get_kv_cache_spec()
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index feca8a7a1e74f..7b8fe2f802d68 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -12,9 +12,11 @@ import torch.nn as nn
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.model_executor.models.interfaces_base import is_pooling_model
-from vllm.pooling_params import PoolingTask
+from vllm.model_executor.models.interfaces import supports_transcription
+from vllm.model_executor.models.interfaces_base import (
+    is_pooling_model, is_text_generation_model)
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
+from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 
 if TYPE_CHECKING:
     from vllm.attention import AttentionMetadata
@@ -224,6 +226,21 @@ class ModelRunnerBase(ABC, Generic[T]):
     def get_model(self) -> nn.Module:
         raise NotImplementedError
 
+    def get_supported_generation_tasks(self) -> list[GenerationTask]:
+        model = self.get_model()
+        supported_tasks = list[GenerationTask]()
+
+        if is_text_generation_model(model):
+            supported_tasks.append("generate")
+
+        if supports_transcription(model):
+            if model.supports_transcription_only:
+                return ["transcription"]
+
+            supported_tasks.append("transcription")
+
+        return supported_tasks
+
     def get_supported_pooling_tasks(self) -> list[PoolingTask]:
         model = self.get_model()
         if not is_pooling_model(model):
@@ -231,6 +248,16 @@ class ModelRunnerBase(ABC, Generic[T]):
 
         return list(model.pooler.get_supported_tasks())
 
+    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
+        tasks = list[SupportedTask]()
+
+        if self.model_config.runner_type == "generate":
+            tasks.extend(self.get_supported_generation_tasks())
+        if self.model_config.runner_type == "pooling":
+            tasks.extend(self.get_supported_pooling_tasks())
+
+        return tuple(tasks)
+
     def execute_model(
         self,
         model_input: T,

From f3a683b7c9df8b251092e48e53d58220bb920f2c Mon Sep 17 00:00:00 2001
From: Mengqing Cao <cmq0113@163.com>
Date: Fri, 25 Jul 2025 20:53:07 +0800
Subject: [PATCH 04/57] [Bugfix][Logprobs] Fix logprobs op to support more
 backend (#21591)

Signed-off-by: MengqingCao <cmq0113@163.com>
---
 vllm/v1/sample/ops/logprobs.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/sample/ops/logprobs.py b/vllm/v1/sample/ops/logprobs.py
index a4d65485140ec..82875b7c84522 100644
--- a/vllm/v1/sample/ops/logprobs.py
+++ b/vllm/v1/sample/ops/logprobs.py
@@ -4,8 +4,10 @@
 
 import torch
 
+from vllm.platforms import current_platform
 
-@torch.compile(dynamic=True)
+
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def batched_count_greater_than(x: torch.Tensor,
                                values: torch.Tensor) -> torch.Tensor:
     """

From c72f049cb4c98b92cb85fc791e74d9220a263cf2 Mon Sep 17 00:00:00 2001
From: xyxinyang <43821961+xyxinyang@users.noreply.github.com>
Date: Fri, 25 Jul 2025 21:02:53 +0800
Subject: [PATCH 05/57] [Model] Fix Ernie4.5MoE e_score_correction_bias
 parameter (#21586)

Signed-off-by: zhouchong <zhouchong03@baidu.com>
Co-authored-by: zhouchong <zhouchong03@baidu.com>
---
 vllm/model_executor/models/ernie45_moe.py | 25 +++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py
index 984003e62d11a..5824b0967e773 100644
--- a/vllm/model_executor/models/ernie45_moe.py
+++ b/vllm/model_executor/models/ernie45_moe.py
@@ -123,14 +123,19 @@ class Ernie4_5_MoeMoE(nn.Module):
                                      quant_config=None,
                                      prefix=f"{prefix}.gate")
 
-        self.experts = FusedMoE(num_experts=config.moe_num_experts,
-                                top_k=config.moe_k,
-                                hidden_size=config.hidden_size,
-                                intermediate_size=config.moe_intermediate_size,
-                                reduce_results=False,
-                                renormalize=True,
-                                quant_config=quant_config,
-                                prefix=f"{prefix}.experts")
+        self.gate.e_score_correction_bias = nn.Parameter(
+            torch.empty(config.moe_num_experts))
+
+        self.experts = FusedMoE(
+            num_experts=config.moe_num_experts,
+            top_k=config.moe_k,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+            e_score_correction_bias=self.gate.e_score_correction_bias)
 
         if self.moe_num_shared_experts is not None:
             intermediate_size = (config.moe_intermediate_size *
@@ -459,6 +464,10 @@ class Ernie4_5_MoeModel(nn.Module):
             if "mtp" in name:
                 continue
 
+            if "e_score_correction_bias" in name:
+                name = name.replace("moe_statics", "gate")
+                loaded_weight = loaded_weight.squeeze(0)
+
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:

From 29c6fbe58cfa705c26ed1b38f262d5ade0b4f9ba Mon Sep 17 00:00:00 2001
From: bigshanedogg <bigshane319@gmail.com>
Date: Fri, 25 Jul 2025 22:05:42 +0900
Subject: [PATCH 06/57] [MODEL] New model support for
 naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B (#20931)

Signed-off-by: bigshanedogg <bigshane319@gmail.com>
---
 docs/models/supported_models.md               |    1 +
 examples/offline_inference/vision_language.py |   80 ++
 .../vision_language_multi_image.py            |   48 +
 .../multimodal/processing/test_common.py      |    1 +
 tests/models/registry.py                      |    3 +
 .../models/hyperclovax_vision.py              | 1231 +++++++++++++++++
 vllm/model_executor/models/registry.py        |    1 +
 7 files changed, 1365 insertions(+)
 create mode 100644 vllm/model_executor/models/hyperclovax_vision.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 0143d137ff3f9..0f3b730eabedc 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -365,6 +365,7 @@ th {
 | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
 | `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ |
 | `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
+| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ |
 | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index e4811c023377f..eb6b410848558 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -316,6 +316,85 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
+def run_hyperclovax_seed_vision(
+    questions: list[str], modality: str
+) -> ModelRequestData:
+    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192 if modality == "image" else 16384,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    messages = list()
+    for question in questions:
+        if modality == "image":
+            """
+            ocr: List the words in the image in raster order. 
+                Even if the word order feels unnatural for reading, 
+                the model will handle it as long as it follows raster order.
+                e.g. "Naver, CLOVA, bigshane"
+            lens_keywords: List the entity names in the image.
+                e.g. "iPhone"
+            lens_local_keywords: List the entity names with quads in the image.
+                e.g. "[0.07, 0.21, 0.92, 0.90] iPhone"
+            """
+            messages.append(
+                [
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "image",
+                                "ocr": "",
+                                "lens_keywords": "",
+                                "lens_local_keywords": "",
+                            },
+                            {
+                                "type": "text",
+                                "text": question,
+                            },
+                        ],
+                    }
+                ]
+            )
+        elif modality == "video":
+            messages.append(
+                [
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "video",
+                            },
+                            {
+                                "type": "text",
+                                "text": question,
+                            },
+                        ],
+                    }
+                ]
+            )
+        else:
+            raise ValueError(f"Unsupported modality: {modality}")
+
+    prompts = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=None,
+    )
+
+
 # Idefics3-8B-Llama3
 def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -1222,6 +1301,7 @@ model_example_map = {
     "glm4v": run_glm4v,
     "glm4_1v": run_glm4_1v,
     "h2ovl_chat": run_h2ovl,
+    "hyperclovax_seed_vision": run_hyperclovax_seed_vision,
     "idefics3": run_idefics3,
     "internvl_chat": run_internvl,
     "nemotron_vl": run_nemotron_vl,
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index eb4f3b6c8f449..2e14fc807e104 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -289,6 +289,53 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_hyperclovax_seed_vision(
+    question: str, image_urls: list[str]
+) -> ModelRequestData:
+    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=16384,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    message = {"role": "user", "content": list()}
+    for _image_url in image_urls:
+        message["content"].append(
+            {
+                "type": "image",
+                "image": _image_url,
+                "ocr": "",
+                "lens_keywords": "",
+                "lens_local_keywords": "",
+            }
+        )
+    message["content"].append(
+        {
+            "type": "text",
+            "text": question,
+        }
+    )
+
+    prompt = tokenizer.apply_chat_template(
+        [
+            message,
+        ],
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        stop_token_ids=None,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
     # NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
     # it will generate poor response for multi-image inputs!
@@ -900,6 +947,7 @@ model_example_map = {
     "h2ovl_chat": load_h2ovl,
     "idefics3": load_idefics3,
     "internvl_chat": load_internvl,
+    "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
     "keye_vl": load_keye_vl,
     "kimi_vl": load_kimi_vl,
     "llava": load_llava,
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index fd5842523178f..c2e9a73fa82f0 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -278,6 +278,7 @@ def _test_processing_correctness_one(
     "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
     "moonshotai/Kimi-VL-A3B-Instruct",
     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
     "llava-hf/llava-1.5-7b-hf",
     "llava-hf/llava-v1.6-mistral-7b-hf",
     "llava-hf/LLaVA-NeXT-Video-7B-hf",
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 3b92462e58a85..1800262ced67f 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -201,6 +201,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                                trust_remote_code=True),
     "HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124",
                                                trust_remote_code=True),
+    "HCXVisionForCausalLM": _HfExamplesInfo(
+        "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
+        trust_remote_code=True),
     "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
                                            trust_remote_code=True),
     "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py
new file mode 100644
index 0000000000000..3e8e50b35c0b7
--- /dev/null
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -0,0 +1,1231 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# copied from : https://github.com/huggingface/transformers
+import ast
+import sys
+from collections import defaultdict
+from collections.abc import Iterable, Mapping, Sequence
+from functools import partial
+from itertools import chain
+from typing import Any, Literal, Optional, TypedDict, Union
+
+import numpy as np
+import PIL
+from einops import rearrange
+from PIL import Image
+
+if sys.version_info >= (3, 11):
+    import typing
+    Unpack = typing.Unpack
+else:
+    import typing_extensions
+    Unpack = typing_extensions.Unpack
+
+import torch
+import torch.nn as nn
+from timm.layers import LayerNorm, LayerNorm2d
+from timm.models.regnet import RegStage
+from transformers import (AutoProcessor, BatchFeature, CLIPVisionConfig,
+                          SiglipVisionConfig)
+from transformers.modeling_utils import no_init_weights
+
+from vllm.config import VllmConfig
+from vllm.inputs import InputProcessingContext
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs)
+from vllm.multimodal.parse import ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, ProcessingCache,
+                                        PromptReplacement, PromptUpdate)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+
+from .clip import CLIPVisionModel
+from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .siglip import SiglipVisionModel
+from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
+from .vision import get_vision_encoder_info
+
+EOT = "<|endofturn|>"
+IMAGE_TOKEN: str = "<|dummy3|>"
+VIDEO_TOKEN: str = "<|_unuse_missing_100270|>"
+
+
+class HCXVisionMultimodalPixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    pixel_values_images: list[torch.Tensor]
+    """
+    Shape: `[(num_grids, num_channels, height, width), ...]` if anyres
+    
+    Note that `height` or `width` may be different per batch and image,
+    in which case the data is passed as a list instead of a batched tensor.
+    """
+    image_sizes_images: list[tuple[Union[int, float]]]
+    """
+    Shape: `[(height, width), ...]`
+    """
+    vision_query_lengths_images: list[Union[int, float]]
+    pixel_values_videos: list[tuple[Union[int, float]]]
+    """
+    Shape: `[(num_grids, num_channels, height, width), ...]` if anyres
+    """
+    vision_query_lengths_videos: list[Union[int, float]]
+
+
+HCXVisionMultimodalInputs = Union[HCXVisionMultimodalPixelInputs]
+
+
+class HCXVisionProcessingInfo(BaseProcessingInfo):
+
+    def get_hf_config(self):
+        return self.ctx.get_hf_config()
+
+    def get_vision_encoder_info(self):
+        return get_vision_encoder_info(self.get_hf_config())
+
+    def get_hf_processor(
+        self,
+        **kwargs: object,
+    ):
+        processor_cls = type(
+            AutoProcessor.from_pretrained(
+                self.ctx.model_config.model,
+                trust_remote_code=self.ctx.model_config.trust_remote_code,
+            ))
+        return self.ctx.get_hf_processor(
+            processor_cls,
+            **kwargs,
+        )
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None, "video": None}
+
+    def get_num_image_tokens(
+        self,
+        *,
+        vision_query_length: Union[int, list[int]],
+    ) -> int:
+        if isinstance(vision_query_length, int):
+            return vision_query_length
+        else:
+            return sum(vision_query_length)
+
+    def get_num_video_tokens(
+        self,
+        *,
+        vision_query_length: Union[int, list[int]],
+    ) -> int:
+        if isinstance(vision_query_length, int):
+            return vision_query_length
+        else:
+            return sum(vision_query_length)
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        vision_encoder_info = self.get_vision_encoder_info()
+        width = height = vision_encoder_info.get_image_size()
+        return ImageSize(width=width, height=height)
+
+    def get_max_image_tokens(self) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
+
+        return self.get_num_image_tokens(
+            image_width=target_width,
+            image_height=target_height,
+        )
+
+
+class HCXVisionDummyInputsBuilder(
+        BaseDummyInputsBuilder[HCXVisionProcessingInfo]):
+
+    def get_dummy_text(
+        self,
+        mm_counts: Mapping[str, int],
+    ) -> str:
+        dummy_text = IMAGE_TOKEN * mm_counts.get(
+            "image", 0) + VIDEO_TOKEN * mm_counts.get("video", 0)
+        return dummy_text
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+        num_videos = mm_counts.get("video", 0)
+
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
+        target_num_frames = 32
+        return {
+            "image":
+            self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+            ),
+            "video":
+            self._get_dummy_videos(
+                width=target_width - 1,
+                height=target_height - 1,
+                num_frames=target_num_frames,
+                num_videos=num_videos,
+            )
+        }
+
+
+class HCXVisionMultiModalProcessor(
+        BaseMultiModalProcessor[HCXVisionProcessingInfo]):
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+
+        def replace_multimodal_token(
+            token_ids: torch.Tensor,
+            target_token: int,
+            repeats: list,
+        ):
+            output = list()
+            _repeats_idx = 0
+            for token_id in token_ids:
+                if token_id == target_token:
+                    output += [
+                        token_id.item(),
+                    ] * repeats[_repeats_idx]
+                    _repeats_idx += 1
+                else:
+                    output += [
+                        token_id.item(),
+                    ]
+            return torch.tensor(output, device=token_ids.device)
+
+        for video_idx, video_arr in enumerate(mm_data.get("videos", list())):
+            if video_arr.dtype == np.uint8:
+                continue
+            mm_data["videos"][video_idx] = video_arr.astype(np.uint8)
+
+        processed_outputs = self.info.ctx.call_hf_processor(
+            hf_processor=self.info.get_hf_processor(**mm_kwargs),
+            data=dict(
+                text=prompt,
+                images=None,
+                videos=None,
+            ),
+        )  # text-only
+
+        if len(mm_data) > 0:
+            # batchify input as a single item
+            images = mm_data.get("images", None)
+            num_images = 0
+            if images is not None:
+                num_images = len(images)
+                images = [
+                    images,
+                ]  # batchify
+
+            videos = mm_data.get("videos",
+                                 None)  # list of video in single conversation
+            num_videos = 0
+            if videos is not None:
+                num_videos = len(videos)
+                videos = [
+                    videos,
+                ]  # batchify
+
+            _processed_outputs = self.info.ctx.call_hf_processor(
+                hf_processor=self.info.get_hf_processor(**mm_kwargs),
+                data=dict(
+                    text=None,
+                    images=images,
+                    videos=videos,
+                ),
+            )  # mm-only
+
+            for k, v in _processed_outputs.items():
+                if len(v) < 1:
+                    continue
+                elif k.endswith("_images"):
+                    # list of list of 4D tensor -> list of 4D tensor
+                    _processed_outputs[k] = v[0]
+                elif k.endswith("_videos"):
+                    # list of list of 4D tensor -> list of 4D tensor
+                    v = v[0]
+                    if k == "pixel_values_videos":
+                        v = torch.cat(v, dim=0)
+                        _c, _w, _h = v.shape[-3:]
+                        v = v.reshape(num_videos, -1, _c, _w, _h)
+                        v = list(torch.unbind(v, dim=0))
+                    _processed_outputs[k] = v
+
+            if num_images > 0:
+                tokenizer = self.info.get_tokenizer()
+                processed_outputs["input_ids"] = torch.stack([
+                    replace_multimodal_token(
+                        token_ids=_input_ids,
+                        target_token=tokenizer.convert_tokens_to_ids(
+                            IMAGE_TOKEN),
+                        repeats=_processed_outputs[
+                            "vision_query_lengths_images"],
+                    ) for _input_ids in processed_outputs["input_ids"]
+                ],
+                                                             dim=0)
+
+            if num_videos > 0:
+                tokenizer = self.info.get_tokenizer()
+                processed_outputs["input_ids"] = torch.stack([
+                    replace_multimodal_token(
+                        token_ids=_input_ids,
+                        target_token=tokenizer.convert_tokens_to_ids(
+                            VIDEO_TOKEN),
+                        repeats=_processed_outputs[
+                            "vision_query_lengths_videos"],
+                    ) for _input_ids in processed_outputs["input_ids"]
+                ],
+                                                             dim=0)
+
+                _ratios = [
+                    len(_pixel_values) for _pixel_values in
+                    _processed_outputs["pixel_values_videos"]
+                ]
+                _num_per_videos = [
+                    int(_e / sum(_ratios) *
+                        len(_processed_outputs["vision_query_lengths_videos"]))
+                    for _e in _ratios
+                ]
+                _processed_outputs["vision_query_lengths_videos"] = [
+                    _processed_outputs["vision_query_lengths_videos"]
+                    [sum(_num_per_videos[:_i]):sum(_num_per_videos[:_i + 1])]
+                    for _i in range(0, num_videos)
+                ]
+
+            processed_outputs.update(_processed_outputs)
+
+        return processed_outputs
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> Sequence[PromptUpdate]:
+        hf_config = self.info.get_hf_config()
+        placeholder = {
+            "image": hf_config.image_token_id,
+            "video": hf_config.video_token_id,
+        }
+
+        def get_replacement_hyperclovax(
+            item_idx: int,
+            modality: str,
+            out_mm_kwargs: MultiModalKwargs,
+        ):
+            num_tokens = None
+            if modality == "image":
+                num_tokens = self.info.get_num_image_tokens(
+                    vision_query_length=out_mm_kwargs[
+                        "vision_query_lengths_images"][item_idx], )
+            if modality == "video":
+                num_tokens = self.info.get_num_video_tokens(
+                    vision_query_length=out_mm_kwargs[
+                        "vision_query_lengths_videos"][item_idx], )
+            assert isinstance(num_tokens, int)
+            return [
+                placeholder[modality],
+            ] * num_tokens
+
+        return [
+            PromptReplacement(
+                modality=modality,
+                target=[
+                    placeholder[modality],
+                ],
+                replacement=partial(
+                    get_replacement_hyperclovax,
+                    modality=modality,
+                    out_mm_kwargs=out_mm_kwargs,
+                ),
+            ) for modality in ("image", "video")
+        ]
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            # image
+            pixel_values_images=MultiModalFieldConfig.batched("image"),
+            image_sizes_images=MultiModalFieldConfig.batched("image"),
+            vision_query_lengths_images=MultiModalFieldConfig.batched("image"),
+            num_queries_vis_abstractors_images=MultiModalFieldConfig.batched(
+                "image"),
+            num_queries_vis_abstractors_slow_images=MultiModalFieldConfig.
+            batched("image"),
+            first_last_frames_slows_images=MultiModalFieldConfig.batched(
+                "image"),
+            # video
+            pixel_values_videos=MultiModalFieldConfig.batched("video"),
+            image_sizes_videos=MultiModalFieldConfig.batched("video"),
+            vision_query_lengths_videos=MultiModalFieldConfig.batched("video"),
+            num_queries_vis_abstractors_videos=MultiModalFieldConfig.batched(
+                "video"),
+            num_queries_vis_abstractors_slow_videos=MultiModalFieldConfig.
+            batched("video"),
+            first_last_frames_slows_videos=MultiModalFieldConfig.batched(
+                "video"),
+        )
+
+
+def _build_hcxvision_hf_info(
+    ctx: InputProcessingContext, ) -> HCXVisionProcessingInfo:
+    return HCXVisionProcessingInfo(ctx)
+
+
+def _build_hcxvision_hf_processor(
+    info: HCXVisionProcessingInfo,
+    dummy_inputs: BaseDummyInputsBuilder[HCXVisionProcessingInfo],
+    *,
+    cache: Optional[ProcessingCache] = None,
+) -> BaseMultiModalProcessor:
+    if isinstance(info, HCXVisionProcessingInfo):
+        return HCXVisionMultiModalProcessor(
+            info,
+            dummy_inputs,  # type: ignore
+            cache=cache,
+        )
+
+    raise NotImplementedError(type(info))
+
+
+def init_vision_tower_for_hcxvision(
+    vision_config,
+    quant_config: Optional[QuantizationConfig],
+    *,
+    use_nth_layer: Optional[int] = None,
+    require_post_norm: Optional[bool] = None,
+    prefix: str = "",
+) -> Union[CLIPVisionModel, SiglipVisionModel]:
+    num_hidden_layers = vision_config.num_hidden_layers
+    if not isinstance(use_nth_layer, int):
+        pass
+    elif use_nth_layer >= 0:
+        num_hidden_layers = use_nth_layer + 1
+    else:
+        num_hidden_layers = num_hidden_layers + use_nth_layer + 1
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        return CLIPVisionModel(
+            vision_config,
+            quant_config=quant_config,
+            num_hidden_layers_override=num_hidden_layers,
+            require_post_norm=require_post_norm,
+            prefix=prefix,
+        )
+    elif isinstance(vision_config, SiglipVisionConfig):
+        return SiglipVisionModel(
+            vision_config,
+            quant_config=quant_config,
+            num_hidden_layers_override=num_hidden_layers,
+            require_post_norm=require_post_norm,
+            prefix=prefix,
+        )
+
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
+class HCXVisionMlp(nn.Module):
+
+    def __init__(
+        self,
+        mm_projector_type,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer=nn.GELU,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.mm_projector_type = mm_projector_type
+        if self.mm_projector_type == "mlp":
+            self.fc1 = nn.Linear(in_features, hidden_features)
+            self.act = act_layer()
+            self.fc2 = nn.Linear(hidden_features, out_features)
+        elif self.mm_projector_type == "inverted_mlp":
+            self.fc1 = nn.Linear(in_features, 2 * hidden_features)
+            self.act = act_layer()
+            self.fc2 = nn.Linear(2 * hidden_features, out_features)
+        else:
+            raise NotImplementedError("{} is not implemented".format(
+                self.mm_projector_type))
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.fc2(x)
+        return x
+
+
+class HCXVisionCAbstractor(nn.Module):
+    """
+    This module is based on C-Abstractor, whose license is under apache-2.0.
+    You can check the original code at 
+    https://github.com/khanrc/honeybee/blob/main/honeybee/projectors/projectors.py
+    and we made necessary modifications.
+    """
+
+    def __init__(
+        self,
+        num_queries: int,
+        num_input_tokens: int,
+        encoder_hidden_size: int,
+        hidden_size: int,
+        output_hidden_size: int,
+        pos_emb: bool = True,
+        prenorm: bool = False,
+    ):
+        super().__init__()
+        self.num_input_tokens = num_input_tokens
+        self.output_hidden_size = output_hidden_size
+
+        # Positional embedding
+        if pos_emb:
+            self.pos_emb = torch.nn.Parameter(
+                torch.zeros(1, num_input_tokens, encoder_hidden_size))
+            self.pos_emb.data.normal_(mean=0.0, std=0.02)
+        else:
+            self.pos_emb = None
+
+        # (Optional) Pre-normalization layer
+        if prenorm:
+            self.prenorm = LayerNorm(encoder_hidden_size)
+        else:
+            self.prenorm = None
+
+        self.build_net(num_queries, encoder_hidden_size, hidden_size,
+                       output_hidden_size)
+        self.dtype = next(self.parameters()).dtype
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        num_queries_vis_abstractors: Optional[list[list[int]]] = None,
+        num_grids: Optional[list[int]] = None,
+    ) -> torch.Tensor:
+        if self.prenorm is not None:
+            x = self.prenorm(x)
+
+        if self.pos_emb is not None:
+            x = x + self.pos_emb
+
+        x = self._forward(
+            x,
+            num_queries_vis_abstractors=num_queries_vis_abstractors,
+            num_grids=num_grids,
+        )  # (B, L, output_hidden_size)
+
+        return x
+
+    def _forward(
+        self,
+        x: torch.Tensor,
+        num_queries_vis_abstractors: Optional[list[list[int]]] = None,
+        num_grids: Optional[list[int]] = None,
+    ) -> torch.Tensor:
+        # x: [B, L, dim]
+        B, L, dim = x.shape
+        hw = int(L**0.5)
+        x = rearrange(x, "b (h w) d -> b d h w", h=hw, w=hw)
+
+        if num_queries_vis_abstractors is not None:
+            assert num_grids is not None
+            return self._forward_adaptive_num_query(
+                x, num_queries_vis_abstractors, num_grids)
+
+        x = self.net(x)
+        x = rearrange(x, "b d h w -> b (h w) d")
+        x = self.readout(x)
+        return x
+
+    def _forward_adaptive_num_query(
+        self,
+        x: torch.Tensor,
+        num_queries_vis_abstractors: Optional[list[list[int]]] = None,
+        num_grids: Optional[list[int]] = None,
+    ) -> list[torch.Tensor]:
+        # self.net is consisted by 3 layers (s1, sampler, s2)
+        assert len(self.net) == 3
+
+        x = self.net[0](x)  # s1
+        new_x = []
+        for i, num_queries in enumerate(num_queries_vis_abstractors):
+            hw = int(num_queries**0.5)
+            sampler = nn.AdaptiveAvgPool2d((hw, hw))
+            out = sampler(x[num_grids[i]:num_grids[i + 1], :])
+            out = self.net[2](out)  # s2
+
+            out = rearrange(out, "b d h w -> b (h w) d")
+            out = self.readout(out)
+
+            new_x.append(out)
+        return new_x
+
+    def build_net(
+        self,
+        n_queries: int,
+        encoder_hidden_size: int,
+        hidden_size: int,
+        output_hidden_size: int,
+        depth: int = 3,
+        mlp_depth: int = 2,
+    ):
+        assert (n_queries**0.5).is_integer(
+        ), f"n_queries must be square number. n_queries: {n_queries}"
+        hw = int(n_queries**0.5)
+
+        # RegBlock = ResBlock + SE
+        RegBlock = partial(
+            RegStage,
+            stride=1,
+            dilation=1,
+            act_layer=nn.SiLU,
+            norm_layer=LayerNorm2d,
+        )
+
+        s1 = RegBlock(
+            depth,
+            encoder_hidden_size,
+            hidden_size,
+        )
+        sampler = nn.AdaptiveAvgPool2d((hw, hw))
+        s2 = RegBlock(
+            depth,
+            hidden_size,
+            hidden_size,
+        )
+
+        self.net = nn.Sequential(s1, sampler, s2)
+        self.readout = self.build_mlp(mlp_depth, hidden_size,
+                                      output_hidden_size)
+
+    def build_mlp(
+        self,
+        depth: int,
+        hidden_size: int,
+        output_hidden_size: int,
+    ):
+        layers = [nn.Linear(hidden_size, output_hidden_size)]
+        for _ in range(1, depth):
+            layers.append(nn.SiLU())
+            layers.append(nn.Linear(output_hidden_size, output_hidden_size))
+        return nn.Sequential(*layers)
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    _build_hcxvision_hf_processor,
+    info=_build_hcxvision_hf_info,
+    dummy_inputs=HCXVisionDummyInputsBuilder)
+class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
+
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
+    }
+
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        **kwargs: Optional[Any],
+    ) -> None:
+        super().__init__()
+
+        # init configs
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        # text_config
+        text_config = config.text_config
+        if text_config.model_type in ["gpt2", "hyperclovax", "llama"]:
+            text_config._attn_implementation = "sdpa"
+        if text_config.model_type != "hyperclovax":
+            text_config.logits_scaling = 1.0
+        # vision_config
+        vision_config = config.vision_config
+        vision_config.auto_map = {}
+        vision_config.anyres = config.anyres
+        vision_config.max_num_grids = config.max_num_grids
+        self.dtype = vllm_config.model_config.dtype
+
+        ## possible_resolution should be matched with preprocessor_config.json
+        config.possible_resolutions = self._init_possible_resolutions(
+            config, vision_config)
+
+        # init models & parameters
+        with no_init_weights():  # weight will be loaded in from_pretrained
+            self.vision_model = init_vision_tower_for_hcxvision(
+                vision_config,
+                quant_config,
+                use_nth_layer=getattr(config, "use_nth_layer", -1),
+                require_post_norm=False,
+                prefix=maybe_prefix(prefix, "vision_model"),
+            )
+        self.mm_projector = self._init_mm_projector(config, text_config,
+                                                    vision_config)
+
+        self.lm_head_vocab_size = getattr(text_config, "padded_vocab_size",
+                                          text_config.vocab_size)
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            hf_config=text_config,
+            prefix=maybe_prefix(prefix, "language_model"),
+        )
+
+        if config.anyres:
+            self.image_newline = nn.Parameter(
+                torch.empty(text_config.hidden_size, dtype=self.dtype))
+
+        self.config = config
+        self.vision_config = vision_config
+        self.text_config = text_config
+
+        # use_sum_loss = bool(kwargs.pop("use_sum_loss", False))
+        # self.reduction = self._init_reduction_type(use_sum_loss)
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+        if modality.startswith("image"):
+            return IMAGE_TOKEN
+        if modality.startswith("video"):
+            return VIDEO_TOKEN
+
+        raise ValueError("Only image or video modality is supported")
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def get_multimodal_embeddings(
+        self,
+        **kwargs: Unpack[HCXVisionMultimodalInputs],
+    ) -> Optional[MultiModalEmbeddings]:
+
+        multimodal_embeddings = list()
+        if kwargs.get("pixel_values_images") is not None:
+            for _pixel_values_images, _image_sizes_images in zip(
+                    kwargs["pixel_values_images"],
+                    kwargs["image_sizes_images"]):
+                _pixel_values_images = _pixel_values_images.unsqueeze(dim=0)
+                _image_sizes_images = _image_sizes_images.unsqueeze(dim=0)
+                _len_pixel_values_images = [
+                    len(pixel_value) for pixel_value in _pixel_values_images
+                ]
+                if isinstance(_image_sizes_images, torch.Tensor):
+                    _image_sizes_images = _image_sizes_images.detach().cpu(
+                    ).tolist()
+                _multimodal_embeddings_images = self.forward_images(
+                    pixel_values_images=_pixel_values_images,
+                    image_sizes_images=_image_sizes_images,
+                    len_pixel_values_images=_len_pixel_values_images,
+                )
+                _multimodal_embeddings_images = torch.cat(
+                    _multimodal_embeddings_images, dim=0)
+                multimodal_embeddings.append(_multimodal_embeddings_images)
+
+        if kwargs.get("pixel_values_videos") is not None:
+            for _pixel_values_videos, _vision_query_lengths_videos in zip(
+                    kwargs["pixel_values_videos"],
+                    kwargs["vision_query_lengths_videos"]):
+                _len_pixel_values_videos = [
+                    len(_vision_query_lengths)
+                    for _vision_query_lengths in _vision_query_lengths_videos
+                ]
+                _c, _w, _h = _pixel_values_videos.shape[-3:]
+                _pixel_values_videos = _pixel_values_videos.reshape(
+                    sum(_len_pixel_values_videos), -1, _c, _w,
+                    _h).unsqueeze(dim=0)
+                _multimodal_embeddings_videos = self.forward_videos(
+                    pixel_values_videos=_pixel_values_videos,
+                    len_pixel_values_videos=_len_pixel_values_videos,
+                )
+                _multimodal_embeddings_videos = torch.cat(
+                    _multimodal_embeddings_videos, dim=0)
+                multimodal_embeddings.append(_multimodal_embeddings_videos)
+        return multimodal_embeddings
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+        if (kwargs.get("pixel_values_images") is not None
+                or kwargs.get("pixel_values_videos")
+                is not None):  # v0 compatibility
+            multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
+        if multimodal_embeddings is not None:
+            multimodal_embeddings = torch.cat(multimodal_embeddings, dim=0)
+            _mask_image = input_ids == self.config.image_token_id
+            _mask_video = input_ids == self.config.video_token_id
+            assert _mask_image.sum() + _mask_video.sum() == len(
+                multimodal_embeddings)
+
+            if multimodal_embeddings.dtype != inputs_embeds.dtype:
+                multimodal_embeddings = multimodal_embeddings.to(
+                    dtype=inputs_embeds.dtype)
+            if multimodal_embeddings.device != inputs_embeds.device:
+                multimodal_embeddings = multimodal_embeddings.to(
+                    device=inputs_embeds.device)
+
+            if _mask_image.sum() > 0:
+                inputs_embeds[
+                    _mask_image] = multimodal_embeddings[:sum(_mask_image)]
+            if _mask_video.sum() > 0:
+                inputs_embeds[_mask_video] = multimodal_embeddings[
+                    -sum(_mask_video):]
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        # NOTE: In v1, inputs_embeds is always generated at model runner, this
+        # condition is for v0 compatibility.
+        elif inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings(input_ids=input_ids,
+                                                      **kwargs)
+            input_ids = None
+        hidden_states = self.language_model.model(input_ids,
+                                                  positions,
+                                                  intermediate_tensors,
+                                                  inputs_embeds=inputs_embeds)
+        return hidden_states
+
+    def forward_images(
+        self,
+        pixel_values_images: list[list[torch.FloatTensor]],
+        image_sizes_images: list[list[tuple[int, int]]],
+        len_pixel_values_images: list[int],
+    ) -> list[list[torch.Tensor]]:
+        if sum(len_pixel_values_images) == 0:
+            return None
+
+        concat_pixel_values_images = torch.cat(list(
+            chain(*pixel_values_images)),
+                                               dim=0)
+
+        visual_token_idx = 0 if "siglip" in self.vision_config.model_type else 1
+        image_forward_outs = self.vision_model(
+            concat_pixel_values_images)[:, visual_token_idx:]
+
+        image_forward_outs = image_forward_outs.to(
+            dtype=self.mm_projector.dtype)
+        image_forward_outs = self.mm_projector(image_forward_outs)  # b (h w) d
+
+        split_sizes = [
+            pixel_value.shape[0] for pixel_value in chain(*pixel_values_images)
+        ]
+        image_forward_outs = torch.split(image_forward_outs,
+                                         split_sizes,
+                                         dim=0)
+
+        # newline for anyres postprocessing
+        image_features = anyres_postprocessing(
+            image_forward_outs=image_forward_outs,
+            image_sizes=[
+                image_size for image_sizes in image_sizes_images
+                for image_size in image_sizes
+            ],
+            num_queries_vis_abstractor=self.config.
+            num_queries_vis_abstractor_image,
+            unpad=self.config.unpad,
+            patch_size=self.vision_config.patch_size,
+            grid_size=self.vision_config.image_size,
+            image_newline=self.image_newline,
+            possible_resolutions=self.config.possible_resolutions,
+        )
+        return image_features
+
+    def forward_videos(
+        self,
+        pixel_values_videos: list[list[torch.FloatTensor]],
+        len_pixel_values_videos: list[int],
+    ) -> list[torch.Tensor]:
+
+        len_video_grids = sum(len_pixel_values_videos)
+        if len_video_grids == 0:
+            return None
+
+        # Run Vision Model
+        concat_pixel_values_videos = torch.cat(list(
+            chain(*pixel_values_videos)),
+                                               dim=0)
+
+        visual_token_idx = 0 if "siglip" in self.vision_config.model_type else 1
+        video_forward_outs = self.vision_model(
+            concat_pixel_values_videos)[:, visual_token_idx:]
+
+        video_forward_outs = video_forward_outs.to(
+            dtype=self.mm_projector.dtype)
+
+        # Run MM-Projector
+        # len(num_grids) == len(num_queries_vis_abstractors) + 1
+        grid_idx = 0
+        num_grids = [
+            grid_idx
+        ]  # e.g. [0, 9, 18, 19, 27, 28, 36, 37, 45, 46, 54, 55, 56]
+        num_queries_vis_abstractors = [
+        ]  # e.g. [81, 81, 81, 9, 81, 9, 81, 9, 81, 9, 81, 9]
+        len_total_frames = video_forward_outs.shape[0]
+
+        if self.config.first_last_frames_slow:
+            # slowfast (first_last_frames_slow)
+            assert len_total_frames != 0
+            if len_total_frames <= 2:
+                num_queries_vis_abstractors.append(
+                    self.config.num_queries_vis_abstractor_video_slow)
+                grid_idx += len_total_frames
+                num_grids.append(grid_idx)
+            else:
+                num_queries_vis_abstractors.append(
+                    self.config.num_queries_vis_abstractor_video_slow)
+                grid_idx += 1
+                num_grids.append(grid_idx)
+
+                num_queries_vis_abstractors.append(
+                    self.config.num_queries_vis_abstractor_video_fast)
+                grid_idx += len_total_frames - 2
+                num_grids.append(grid_idx)
+
+                num_queries_vis_abstractors.append(
+                    self.config.num_queries_vis_abstractor_video_slow)
+                grid_idx += 1
+                num_grids.append(grid_idx)
+        else:
+            # slowfast
+            for pixel_values_frames in pixel_values_videos:
+                for pixel_values_frame in pixel_values_frames:
+                    if len(pixel_values_frame) > 0:
+                        num_queries_vis_abstractors.append(
+                            self.config.num_queries_vis_abstractor_video_slow)
+                        grid_idx += 1
+                        num_grids.append(grid_idx)
+                        num_queries_vis_abstractors.append(
+                            self.config.num_queries_vis_abstractor_video_fast)
+                        grid_idx = grid_idx + len(pixel_values_frame) - 1
+                        num_grids.append(grid_idx)
+
+        video_forward_outs = self.mm_projector(video_forward_outs,
+                                               num_queries_vis_abstractors,
+                                               num_grids)
+
+        video_features = []  # what we want to return
+        target_features = []
+        target_group_size = 0
+        group_counter = 0
+        video_groups = [
+            len(frame) for frames in pixel_values_videos for frame in frames
+        ]  # for concat video features after projector
+
+        for forward_out in video_forward_outs:
+            target_group_size += len(forward_out)
+            target_features.append(forward_out.flatten(0, 1))
+
+            video_group_size = video_groups[group_counter]
+            if video_group_size == target_group_size:
+                video_features.append(torch.cat(target_features, dim=0))
+                target_features = []
+                group_counter += 1
+                target_group_size = 0
+
+            elif video_group_size < target_group_size:
+                raise RuntimeError(f"video_group_size < target_group_size!! \
+                        [{video_group_size} < {target_group_size}]")
+
+        assert len(target_features
+                   ) == 0, f"target_features is not empty!! {target_features}"
+        assert len(video_groups) == len(video_features)
+
+        return video_features
+
+    def _prepare_multimodal_kwargs(self, **kwargs: object):
+        output = defaultdict(list)
+        for k, v in kwargs.items():
+            if len(v) < 1 or len(v[0]) < 1:
+                continue  # if empty batch of empty sample
+
+            new_k, is_video = k, False
+            if (not k.endswith("_images") and not k.endswith("_videos")):
+                pass
+            else:
+                new_k, is_video = k.split("_")[:-1], k.split("_")[-1]
+                new_k = "_".join(new_k)
+                is_video = is_video == "videos"
+
+            for _sample_idx, _v in enumerate(v):  # batch -> sample
+                if new_k not in ["pixel_values"]:
+                    if len(output[new_k]) < _sample_idx + 1:
+                        output[new_k].append(list())
+                    _v = _v.detach().cpu().numpy().tolist()
+                    output[new_k][_sample_idx] += _v
+                elif isinstance(_v, torch.Tensor):
+                    if len(output[new_k]) < _sample_idx + 1:
+                        output[new_k].append(list())
+                        output["is_videos"].append(list())
+                    _v = list(torch.unbind(_v, dim=0))
+                    output[new_k][_sample_idx] += _v
+                    output["is_videos"][_sample_idx] += [
+                        is_video,
+                    ] * len(_v)
+        return dict(output)
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        return self.language_model.compute_logits(hidden_states,
+                                                  sampling_metadata)
+
+    def load_weights(
+        self,
+        weights: Iterable[tuple[str, torch.Tensor]],
+    ) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
+
+    def _init_possible_resolutions(
+        self,
+        config,
+        vision_config,
+    ):
+        if not getattr(config, "possible_resolutions", []):
+            possible_resolutions = []
+            if config.anyres:
+                assert config.max_num_grids > 0
+                for i in range(1, config.max_num_grids + 1):
+                    for j in range(1, config.max_num_grids + 1):
+                        if i == 1 and j == 1 and not config.use_1x1_grid:
+                            continue
+                        if i * j <= config.max_num_grids:
+                            possible_resolutions.append([i, j])
+
+                possible_resolutions = [[
+                    ys * vision_config.image_size,
+                    xs * vision_config.image_size
+                ] for ys, xs in possible_resolutions]
+            return possible_resolutions
+        else:
+            return config.possible_resolutions
+
+    def _init_mm_projector(
+        self,
+        config,
+        text_config,
+        vision_config,
+    ):
+        input_hidden_size = vision_config.hidden_size
+        if config.mm_projector_type == "linear":
+            mm_projector = nn.Linear(input_hidden_size,
+                                     text_config.hidden_size)
+            mm_projector.dtype = next(mm_projector.parameters()).dtype
+        elif config.mm_projector_type == "cabstractor":
+            mm_projector = HCXVisionCAbstractor(
+                num_queries=config.num_queries_vis_abstractor_image,
+                num_input_tokens=(vision_config.image_size //
+                                  vision_config.patch_size)**2,
+                encoder_hidden_size=input_hidden_size,
+                hidden_size=input_hidden_size,
+                output_hidden_size=text_config.hidden_size,
+                pos_emb=config.proj_pos_emb,
+                prenorm=config.proj_prenorm,
+            )
+        else:
+            mm_projector = HCXVisionMlp(
+                config.mm_projector_type,
+                input_hidden_size,
+                hidden_features=input_hidden_size,
+                out_features=self.text_config.hidden_size,
+            )
+        return mm_projector
+
+
+def unpad_image(tensor: torch.Tensor,
+                original_size: tuple[int, int]) -> torch.Tensor:
+    original_width, original_height = original_size
+    current_height, current_width = tensor.shape[1:]
+
+    original_aspect_ratio = original_width / original_height
+    current_aspect_ratio = current_width / current_height
+
+    if original_aspect_ratio > current_aspect_ratio:
+        scale_factor = current_width / original_width
+        new_height = int(original_height * scale_factor)
+        padding = (current_height - new_height) // 2
+        unpadded_tensor = tensor[:, padding:current_height - padding, :]
+    else:
+        scale_factor = current_height / original_height
+        new_width = int(original_width * scale_factor)
+        padding = (current_width - new_width) // 2
+        unpadded_tensor = tensor[:, :, padding:current_width - padding]
+
+    return unpadded_tensor
+
+
+def select_best_resolution(original_size: tuple,
+                           possible_resolutions: list) -> tuple:
+    original_height, original_width = original_size
+    best_fit = None
+    max_effective_resolution = 0
+    min_wasted_resolution = float("inf")
+
+    for height, width in possible_resolutions:
+        scale = min(width / original_width, height / original_height)
+        downscaled_width, downscaled_height = int(original_width * scale), int(
+            original_height * scale)
+        effective_resolution = min(downscaled_width * downscaled_height,
+                                   original_width * original_height)
+        wasted_resolution = (width * height) - effective_resolution
+
+        if effective_resolution > max_effective_resolution or (
+                effective_resolution == max_effective_resolution
+                and wasted_resolution < min_wasted_resolution):
+            max_effective_resolution = effective_resolution
+            min_wasted_resolution = wasted_resolution
+            best_fit = (height, width)
+
+    return best_fit
+
+
+def get_anyres_image_grid_shape(
+    image_size: tuple[int, int],
+    grid_pinpoints: Union[str, list[tuple[int, int]]],
+    patch_size: int,
+) -> tuple[int, int]:
+    possible_resolutions = grid_pinpoints if isinstance(
+        grid_pinpoints, list) else ast.literal_eval(grid_pinpoints)
+
+    original_width, original_height = image_size
+    height, width = select_best_resolution((original_height, original_width),
+                                           possible_resolutions)
+    return width // patch_size, height // patch_size
+
+
+def reshape_and_unpad_image_features(
+    image_feature: torch.Tensor,
+    height: int,
+    width: int,
+    image_size: tuple[int, int],
+    possible_resolutions: list[tuple[int, int]],
+    grid_size: int,
+    unpad: bool,
+    image_newline: torch.Tensor,
+) -> torch.Tensor:
+    base_image_feature = image_feature[0]
+    image_feature = image_feature[1:]
+
+    assert (height * width == base_image_feature.shape[0]
+            ), f"height: {height}, width: {width}, \
+        base_image_feature.shape[0]: {base_image_feature.shape[0]}"
+
+    num_patch_width, num_patch_height = get_anyres_image_grid_shape(
+        image_size, possible_resolutions, grid_size)
+    image_feature = image_feature.view(num_patch_height, num_patch_width,
+                                       height, width, -1)
+
+    if unpad:
+        image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
+        image_feature = image_feature.flatten(1, 2).flatten(2, 3)
+        image_feature = unpad_image(image_feature, image_size)
+        image_feature = torch.cat(
+            (
+                image_feature,
+                image_newline[:, None, None].expand(
+                    *image_feature.shape[:-1], 1).to(image_feature.device),
+            ),
+            dim=-1,
+        )
+        image_feature = image_feature.flatten(1, 2).transpose(0, 1)
+    else:
+        image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
+        image_feature = image_feature.flatten(0, 3)
+    image_feature = torch.cat((base_image_feature, image_feature), dim=0)
+
+    return image_feature
+
+
+def anyres_postprocessing(
+    image_forward_outs: list[torch.FloatTensor],
+    image_sizes: list[list[int]],
+    possible_resolutions: list[tuple[int, int]],
+    patch_size: int,
+    grid_size: int,
+    image_newline: torch.FloatTensor,
+    num_queries_vis_abstractor: int = -1,
+    unpad: bool = False,
+) -> list[torch.FloatTensor]:
+    height = width = grid_size // patch_size
+
+    if num_queries_vis_abstractor > 0:
+        assert (num_queries_vis_abstractor**0.5
+                ).is_integer(), "n_queries must be square number"
+        height = width = int(num_queries_vis_abstractor**0.5)
+
+    # post-processing (unpad, add newline)
+    new_image_features = []
+    for image_idx, image_feature in enumerate(image_forward_outs):
+        if image_feature.shape[0] > 1:
+            image_feature = reshape_and_unpad_image_features(
+                image_feature=image_feature,
+                height=height,
+                width=width,
+                image_size=image_sizes[image_idx],
+                possible_resolutions=possible_resolutions,
+                grid_size=grid_size,  # Pass grid info if needed by helper
+                unpad=unpad,
+                image_newline=image_newline,
+            )
+        else:
+            image_feature = image_feature[0]
+            image_feature = torch.cat(
+                (image_feature, image_newline[None].to(image_feature.device)),
+                dim=0)
+        new_image_features.append(image_feature)
+    image_features = new_image_features
+    return image_features
+
+
+def resize_image(
+    image: Union[np.ndarray, PIL.Image.Image],
+    max_side: int = 378,
+) -> np.ndarray:
+    image_arr = image
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(image)
+
+    width, height = image.size
+    cur_max_size = max(width, height)
+    if cur_max_size <= max_side:
+        return image_arr
+
+    scale = max_side / cur_max_size
+    width = int(width * scale)
+    height = int(height * scale)
+    image = image.resize((width, height), Image.LANCZOS)
+    image_arr = np.array(image)
+    return image_arr
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 7470b31e1253d..14a8ac7876f73 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -81,6 +81,7 @@ _TEXT_GENERATION_MODELS = {
     "Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"),
     "HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"),
     "HunYuanDenseV1ForCausalLM": ("hunyuan_v1", "HunYuanDenseV1ForCausalLM"),
+    "HCXVisionForCausalLM": ("hyperclovax_vision", "HCXVisionForCausalLM"),
     "InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
     "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
     "InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"),

From 9fe98d42508f9a67d5b1c491a4a306966ded3976 Mon Sep 17 00:00:00 2001
From: kourosh hakhamaneshi <31483498+kouroshHakha@users.noreply.github.com>
Date: Fri, 25 Jul 2025 06:49:11 -0700
Subject: [PATCH 07/57] [Frontend] Add request_id to the Request object so they
 can be controlled better via external load balancers (#21009)

Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
---
 vllm/entrypoints/openai/protocol.py           | 21 +++++++++++++++++++
 vllm/entrypoints/openai/serving_completion.py |  4 +++-
 vllm/entrypoints/openai/serving_embedding.py  |  5 +++--
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 6c6ec207a3cac..b6b3bf3f530e3 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1007,6 +1007,13 @@ class CompletionRequest(OpenAIBaseModel):
             "default: 0). Any priority other than 0 will raise an error "
             "if the served model does not use priority scheduling."),
     )
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."),
+    )
     logits_processors: Optional[LogitsProcessors] = Field(
         default=None,
         description=(
@@ -1251,6 +1258,13 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
             "default: 0). Any priority other than 0 will raise an error "
             "if the served model does not use priority scheduling."),
     )
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."),
+    )
 
     # --8<-- [end:embedding-extra-params]
 
@@ -1302,6 +1316,13 @@ class EmbeddingChatRequest(OpenAIBaseModel):
             "default: 0). Any priority other than 0 will raise an error "
             "if the served model does not use priority scheduling."),
     )
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."),
+    )
     # --8<-- [end:chat-embedding-extra-params]
 
     @model_validator(mode="before")
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 323795ca4372d..22c6b6250394c 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -113,7 +113,9 @@ class OpenAIServingCompletion(OpenAIServing):
             return self.create_error_response(
                 "Echo is unsupported with prompt embeds.")
 
-        request_id = f"cmpl-{self._base_request_id(raw_request)}"
+        request_id = (
+            f"cmpl-"
+            f"{self._base_request_id(raw_request, request.request_id)}")
         created_time = int(time.time())
 
         request_metadata = RequestResponseMetadata(request_id=request_id)
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index 697f43c018b27..84ba00873103d 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -163,8 +163,9 @@ class OpenAIServingEmbedding(EmbeddingMixin):
         for the API specification. This API mimics the OpenAI Embedding API.
         """
         model_name = self._get_model_name(request.model)
-        request_id = (f"{self.request_id_prefix}-"
-                      f"{self._base_request_id(raw_request)}")
+        request_id = (
+            f"{self.request_id_prefix}-"
+            f"{self._base_request_id(raw_request, request.request_id)}")
 
         ctx = EmbeddingServeContext(
             request=request,

From eab2f3980cd30132dd99e23ea8d6fc0274365757 Mon Sep 17 00:00:00 2001
From: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Date: Fri, 25 Jul 2025 09:49:36 -0400
Subject: [PATCH 08/57] [Model] Replace Mamba2 RMSNorm Gated with Fused Triton
 Kernel (#20839)

Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Signed-off-by: Yu Chin Fabian Lim <fabian.lim@gmail.com>
Signed-off-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: Yu Chin Fabian Lim <fabian.lim@gmail.com>
---
 .../layers/mamba/mamba_mixer2.py              |  21 +--
 .../layers/mamba/ops/layernorm_gated.py       | 168 ++++++++++++++++++
 2 files changed, 176 insertions(+), 13 deletions(-)
 create mode 100644 vllm/model_executor/layers/mamba/ops/layernorm_gated.py

diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py
index e32b2be4d40e7..2c95099e53ad6 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -24,6 +24,7 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
     extra_groups_for_head_shards, get_mamba_state_shape)
 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
     causal_conv1d_fn, causal_conv1d_update)
+from vllm.model_executor.layers.mamba.ops.layernorm_gated import rms_norm_gated
 from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
     selective_state_update)
 from vllm.model_executor.layers.mamba.ops.ssd_combined import (
@@ -133,21 +134,15 @@ class Mixer2RMSNormGated(CustomOp):
             return x * nn.functional.silu(gate.to(
                 torch.float32)).to(input_dtype)
 
-        if self.tp_size > 1 or self.n_groups != 1:
+        if (((self.n_groups % self.tp_size) != 0) or self.n_groups != 1):
             return self.forward_native(x, gate)
 
-        from vllm import _custom_ops as ops
-
-        # cast x and gate to float32 before silu
-        out = torch.empty_like(x)
-        y = x * nn.functional.silu(gate.to(torch.float32))
-        ops.rms_norm(
-            out,
-            y.to(x.dtype),
-            self.weight.data,
-            self.variance_epsilon,
-        )
-        return out
+        return rms_norm_gated(x,
+                              self.weight.data,
+                              bias=None,
+                              z=gate,
+                              eps=self.variance_epsilon,
+                              norm_before_gate=False)
 
 
 def mamba_v2_sharded_weight_loader(
diff --git a/vllm/model_executor/layers/mamba/ops/layernorm_gated.py b/vllm/model_executor/layers/mamba/ops/layernorm_gated.py
new file mode 100644
index 0000000000000..f3a45ab097c34
--- /dev/null
+++ b/vllm/model_executor/layers/mamba/ops/layernorm_gated.py
@@ -0,0 +1,168 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright (c) 2024, Tri Dao.
+# Adapted from https://github.com/state-spaces/mamba/blob/60dadf2e0ee730ac337035d5533de10bc26e4847/mamba_ssm/ops/triton/layernorm_gated.py
+
+import torch
+
+from vllm.triton_utils import tl, triton
+
+
+@triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
+@triton.heuristics({"HAS_Z": lambda args: args["Z"] is not None})
+@triton.jit
+def _layer_norm_fwd_1pass_kernel(
+    X,  # pointer to the input
+    Y,  # pointer to the output
+    W,  # pointer to the weights
+    B,  # pointer to the biases
+    Z,  # pointer to the other branch
+    Mean,  # pointer to the mean
+    Rstd,  # pointer to the 1/std
+    stride_x_row: tl.int64,
+    stride_y_row: tl.int64,
+    stride_z_row: tl.int64,
+    M: tl.int64,  # number of rows in X
+    N: tl.int64,  # number of columns in X
+    eps,  # epsilon to avoid division by zero
+    BLOCK_N: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+    HAS_Z: tl.constexpr,
+    NORM_BEFORE_GATE: tl.constexpr,
+    IS_RMS_NORM: tl.constexpr,
+):
+    # Map the program id to the row of X and Y it should compute.
+    row = tl.program_id(0)
+    group = tl.program_id(1)
+    X += row * stride_x_row + group * N
+    Y += row * stride_y_row + group * N
+    if HAS_Z:
+        Z += row * stride_z_row + group * N
+    if not IS_RMS_NORM:
+        Mean += group * M
+    Rstd += group * M
+    W += group * N
+    if HAS_BIAS:
+        B += group * N
+    # Compute mean and variance
+    cols = tl.arange(0, BLOCK_N)
+    x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32)
+    if HAS_Z and not NORM_BEFORE_GATE:
+        z = tl.load(Z + cols, mask=cols < N).to(tl.float32)
+        x *= z * tl.sigmoid(z)
+    if not IS_RMS_NORM:
+        mean = tl.sum(x, axis=0) / N
+        tl.store(Mean + row, mean)
+        xbar = tl.where(cols < N, x - mean, 0.)
+        var = tl.sum(xbar * xbar, axis=0) / N
+    else:
+        xbar = tl.where(cols < N, x, 0.)
+        var = tl.sum(xbar * xbar, axis=0) / N
+    rstd = 1 / tl.sqrt(var + eps)
+    tl.store(Rstd + row, rstd)
+    # Normalize and apply linear transformation
+    mask = cols < N
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    if HAS_BIAS:
+        b = tl.load(B + cols, mask=mask).to(tl.float32)
+    x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
+    y = x_hat * w + b if HAS_BIAS else x_hat * w
+    if HAS_Z and NORM_BEFORE_GATE:
+        z = tl.load(Z + cols, mask=mask).to(tl.float32)
+        y *= z * tl.sigmoid(z)
+    # Write output
+    tl.store(Y + cols, y, mask=mask)
+
+
+def _layer_norm_fwd(x,
+                    weight,
+                    bias,
+                    eps,
+                    z=None,
+                    out=None,
+                    group_size=None,
+                    norm_before_gate=True,
+                    is_rms_norm=False):
+    M, N = x.shape
+    if group_size is None:
+        group_size = N
+    assert N % group_size == 0
+    ngroups = N // group_size
+    assert x.stride(-1) == 1
+    if z is not None:
+        assert z.stride(-1) == 1
+        assert z.shape == (M, N)
+    assert weight.shape == (N, )
+    assert weight.stride(-1) == 1
+    if bias is not None:
+        assert bias.stride(-1) == 1
+        assert bias.shape == (N, )
+    # allocate output
+    if out is not None:
+        assert out.shape == x.shape
+    else:
+        out = torch.empty_like(x)
+    assert out.stride(-1) == 1
+    mean = torch.empty((ngroups * M, ), dtype=torch.float32,
+                       device=x.device) if not is_rms_norm else None
+    rstd = torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device)
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
+    if group_size > BLOCK_N:
+        raise RuntimeError(
+            "This layer norm doesn't support feature dim >= 64KB.")
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK_N // 256, 1), 8)
+    grid = (M, ngroups)
+    with torch.cuda.device(x.device.index):
+        _layer_norm_fwd_1pass_kernel[grid](x,
+                                           out,
+                                           weight,
+                                           bias,
+                                           z,
+                                           mean,
+                                           rstd,
+                                           x.stride(0),
+                                           out.stride(0),
+                                           z.stride(0) if z is not None else 0,
+                                           M,
+                                           group_size,
+                                           eps,
+                                           BLOCK_N=BLOCK_N,
+                                           NORM_BEFORE_GATE=norm_before_gate,
+                                           IS_RMS_NORM=is_rms_norm,
+                                           num_warps=num_warps)
+    return out, mean, rstd
+
+
+def rms_norm_gated(x,
+                   weight,
+                   bias,
+                   z=None,
+                   eps=1e-6,
+                   group_size=None,
+                   norm_before_gate=True):
+    x_shape_og = x.shape
+    # reshape input data into 2D tensor
+    x = x.reshape(-1, x.shape[-1])
+    if x.stride(-1) != 1:
+        x = x.contiguous()
+    if z is not None:
+        assert z.shape == x_shape_og
+        z = z.reshape(-1, z.shape[-1])
+        if z.stride(-1) != 1:
+            z = z.contiguous()
+    weight = weight.contiguous()
+    if bias is not None:
+        bias = bias.contiguous()
+    y, _, _ = _layer_norm_fwd(x,
+                              weight,
+                              bias,
+                              eps,
+                              z=z,
+                              group_size=group_size,
+                              norm_before_gate=norm_before_gate,
+                              is_rms_norm=True)
+
+    return y.reshape(x_shape_og)

From b3caeb82e7407d5faa30c49aecd951df3dafd42c Mon Sep 17 00:00:00 2001
From: who who who <fsx950223@outlook.com>
Date: Fri, 25 Jul 2025 21:50:21 +0800
Subject: [PATCH 09/57] [ROCm][AITER] Enable fp8 kv cache on rocm aiter
 backend. (#20295)

Signed-off-by: fsx950223 <fsx950223@outlook.com>
Signed-off-by: amd-ruitang3 <Rui.Tang2@amd.com>
Co-authored-by: amd-ruitang3 <Rui.Tang2@amd.com>
---
 .../attention/test_aiter_flash_attn.py        | 191 +++++++++++++++
 vllm/v1/attention/backends/rocm_aiter_fa.py   | 225 ++++++++++--------
 2 files changed, 320 insertions(+), 96 deletions(-)
 create mode 100644 tests/kernels/attention/test_aiter_flash_attn.py

diff --git a/tests/kernels/attention/test_aiter_flash_attn.py b/tests/kernels/attention/test_aiter_flash_attn.py
new file mode 100644
index 0000000000000..d0687c62b1132
--- /dev/null
+++ b/tests/kernels/attention/test_aiter_flash_attn.py
@@ -0,0 +1,191 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Optional
+
+import pytest
+import torch
+
+import vllm.v1.attention.backends.rocm_aiter_fa  # noqa: F401
+from vllm.platforms import current_platform
+
+NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
+HEAD_SIZES = [128, 256]
+BLOCK_SIZES = [16, 32]
+DTYPES = [torch.float16, torch.bfloat16]
+QDTYPES = [None]
+# one value large enough to test overflow in index calculation.
+# one value small enough to test the schema op check
+NUM_BLOCKS = [32768, 2048]
+
+
+def ref_paged_attn(
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    query_lens: list[int],
+    kv_lens: list[int],
+    block_tables: torch.Tensor,
+    scale: float,
+    sliding_window: Optional[int] = None,
+    soft_cap: Optional[float] = None,
+) -> torch.Tensor:
+    num_seqs = len(query_lens)
+    block_tables = block_tables.cpu().numpy()
+    _, block_size, num_kv_heads, head_size = key_cache.shape
+
+    outputs: list[torch.Tensor] = []
+    start_idx = 0
+    for i in range(num_seqs):
+        query_len = query_lens[i]
+        kv_len = kv_lens[i]
+        q = query[start_idx:start_idx + query_len]
+        q *= scale
+
+        num_kv_blocks = (kv_len + block_size - 1) // block_size
+        block_indices = block_tables[i, :num_kv_blocks]
+
+        k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
+        k = k[:kv_len]
+        v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
+        v = v[:kv_len]
+
+        if q.shape[1] != k.shape[1]:
+            k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
+            v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)
+        attn = torch.einsum("qhd,khd->hqk", q, k).float()
+        empty_mask = torch.ones(query_len, kv_len)
+        mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
+        if sliding_window is not None:
+            sliding_window_mask = torch.triu(empty_mask,
+                                             diagonal=kv_len -
+                                             (query_len + sliding_window) +
+                                             1).bool().logical_not()
+            mask |= sliding_window_mask
+        if soft_cap is not None:
+            attn = soft_cap * torch.tanh(attn / soft_cap)
+        attn.masked_fill_(mask, float("-inf"))
+        attn = torch.softmax(attn, dim=-1).to(v.dtype)
+        out = torch.einsum("hqk,khd->qhd", attn, v)
+
+        outputs.append(out)
+        start_idx += query_len
+
+    return torch.cat(outputs, dim=0)
+
+
+@pytest.mark.skipif(not current_platform.is_rocm(),
+                    reason="Only ROCm is supported")
+@pytest.mark.parametrize("seq_lens",
+                         [[(10, 1328), (5, 18),
+                           (129, 463)], [(8, 523), (24, 37), (3, 2011)]])
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("sliding_window", [None, 256])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("q_dtype", QDTYPES)
+@torch.inference_mode()
+def test_varlen_with_paged_kv(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: Optional[int],
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: Optional[float],
+    num_blocks: int,
+    q_dtype: Optional[torch.dtype],
+) -> None:
+    torch.set_default_device("cuda")
+    current_platform.seed_everything(0)
+    num_seqs = len(seq_lens)
+    query_lens = [x[0] for x in seq_lens]
+    kv_lens = [x[1] for x in seq_lens]
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    max_query_len = max(query_lens)
+    max_kv_len = max(kv_lens)
+    window_size = ((sliding_window - 1, 0) if sliding_window is not None else
+                   (-1, -1))
+    scale = head_size**-0.5
+
+    query = torch.randn(sum(query_lens),
+                        num_query_heads,
+                        head_size,
+                        dtype=dtype)
+    key_cache = torch.randn(num_blocks,
+                            block_size,
+                            num_kv_heads,
+                            head_size,
+                            dtype=dtype)
+    value_cache = torch.randn_like(key_cache)
+    cu_query_lens = torch.tensor([0] + query_lens,
+                                 dtype=torch.int32).cumsum(dim=0,
+                                                           dtype=torch.int32)
+
+    cu_seq_lens = torch.tensor([0] + kv_lens,
+                               dtype=torch.int32).cumsum(dim=0,
+                                                         dtype=torch.int32)
+    kv_lens = torch.tensor(kv_lens, dtype=torch.int32)
+
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(0,
+                                 num_blocks,
+                                 (num_seqs, max_num_blocks_per_seq),
+                                 dtype=torch.int32)
+
+    output = torch.empty_like(query)
+
+    maybe_quantized_query = query
+    maybe_quantized_key_cache = key_cache
+    maybe_quantized_value_cache = value_cache
+    k_descale = None
+    v_descale = None
+    if q_dtype is not None:
+        # QKV are drawn from N(0, 1): no need for a fp8 scaling factor
+        maybe_quantized_query = query.to(q_dtype)
+        maybe_quantized_key_cache = key_cache.to(q_dtype)
+        maybe_quantized_value_cache = value_cache.to(q_dtype)
+
+        scale_shape = (num_seqs, num_kv_heads)
+        k_descale = torch.ones(scale_shape, dtype=torch.float32)
+        v_descale = torch.ones(scale_shape, dtype=torch.float32)
+
+    torch.ops.vllm.flash_attn_varlen_func(
+        maybe_quantized_query,
+        maybe_quantized_key_cache,
+        maybe_quantized_value_cache,
+        out=output,
+        cu_seqlens_q=cu_query_lens,
+        max_seqlen_q=max_query_len,
+        max_seqlen_k=max_kv_len,
+        softmax_scale=scale,
+        alibi_slopes=None,
+        window_size=window_size,
+        block_table=block_tables,
+        cu_seqlens_k=cu_seq_lens,
+        k_scale=k_descale,
+        v_scale=v_descale,
+    )
+
+    ref_output = ref_paged_attn(
+        query=query,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        query_lens=query_lens,
+        kv_lens=kv_lens,
+        block_tables=block_tables,
+        scale=scale,
+        sliding_window=sliding_window,
+        soft_cap=soft_cap,
+    )
+
+    atol, rtol = 2e-2, 2e-2
+    if q_dtype is not None:
+        atol, rtol = 1.5e-1, 1.5e-1
+    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), \
+        f"{torch.max(torch.abs(output - ref_output))}"
diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py
index 0739d2596676f..85a5dc8c91c13 100644
--- a/vllm/v1/attention/backends/rocm_aiter_fa.py
+++ b/vllm/v1/attention/backends/rocm_aiter_fa.py
@@ -2,20 +2,21 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with AiterFlashAttention."""
 from dataclasses import dataclass
-from typing import Optional
+from typing import ClassVar, Optional
 
 import torch
 
-from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata, AttentionType,
-                                              is_quantized_kv_cache)
+                                              AttentionMetadata, AttentionType)
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
+                                              CommonAttentionMetadata)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
+_PARTITION_SIZE_ROCM = 256
+
 if current_platform.is_rocm():
     import aiter
 
@@ -32,38 +33,54 @@ if current_platform.is_rocm():
         b_seq_lens_loc,
         block_table,
         block_table_stride_0,
+        k_scale,
+        v_scale,
+        output_dtype: tl.constexpr,
         E_DIM: tl.constexpr,
         BLOCK_SIZE: tl.constexpr,
     ):
         batch_idx = tl.program_id(0)
         block_idx = tl.program_id(1)
-        batch_token_indexes = tl.load(b_seq_lens_loc + batch_idx +
-                                      tl.arange(0, 2))
-        batch_token_start, batch_token_end = tl.split(batch_token_indexes)
-        seq_len = batch_token_end - batch_token_start
 
         batch_query_indexes = tl.load(b_query_lens_loc + batch_idx +
                                       tl.arange(0, 2))
         batch_query_start, batch_query_end = tl.split(batch_query_indexes)
         query_len = batch_query_end - batch_query_start
+
         if query_len <= 1:
             return
+
+        batch_token_indexes = tl.load(b_seq_lens_loc + batch_idx +
+                                      tl.arange(0, 2))
+        batch_token_start, batch_token_end = tl.split(batch_token_indexes)
+        seq_len = batch_token_end - batch_token_start
+
         if block_idx * BLOCK_SIZE < seq_len:
             block_mask = (block_idx * BLOCK_SIZE +
                           tl.arange(0, BLOCK_SIZE)[:, None]) < seq_len
 
             kv_idx = tl.load(block_table + batch_idx * block_table_stride_0 +
-                             block_idx)
+                             block_idx).to(tl.int64)
 
             kv_buffer_off = kv_idx * BLOCK_SIZE * E_DIM + tl.arange(
                 0, BLOCK_SIZE)[:, None] * E_DIM + tl.arange(0, E_DIM)[None, :]
             k_vals = tl.load(k_buffer_ptr + kv_buffer_off,
                              mask=block_mask,
                              other=0.0)
+            if k_vals.dtype.is_fp8():
+                k_vals = (k_vals.to(tl.float32) *
+                          tl.load(k_scale)).to(output_dtype)
+            else:
+                k_vals = k_vals.to(output_dtype)
+
             v_vals = tl.load(v_buffer_ptr + kv_buffer_off,
                              mask=block_mask,
                              other=0.0)
-
+            if v_vals.dtype.is_fp8():
+                v_vals = (v_vals.to(tl.float32) *
+                          tl.load(v_scale)).to(output_dtype)
+            else:
+                v_vals = v_vals.to(output_dtype)
             kv_values_off = batch_token_start * E_DIM + \
                 block_idx * BLOCK_SIZE * E_DIM + \
                 tl.arange(0, BLOCK_SIZE)[:, None] * E_DIM + \
@@ -72,29 +89,44 @@ if current_platform.is_rocm():
             tl.store(v_values_ptr + kv_values_off, v_vals, mask=block_mask)
 
     def vllm_layout_trans(b_query_lens_loc, b_seq_lens_loc, block_table,
-                          k_buffer, v_buffer, max_seq_len, total_tokens):
-        H_KV = v_buffer.shape[2]
-        D = v_buffer.shape[3]
-        BLOCK_SIZE = v_buffer.shape[1]
-        dtype = k_buffer.dtype
-        k_values = torch.empty((total_tokens, H_KV, D),
-                               dtype=dtype,
-                               device="cuda")
-        v_values = torch.empty((total_tokens, H_KV, D),
-                               dtype=dtype,
-                               device="cuda")
+                          k_cache, v_cache, max_seq_len, k_scale, v_scale,
+                          output_dtype, total_tokens):
+        H_KV = v_cache.shape[2]
+        D = v_cache.shape[3]
+        BLOCK_SIZE = v_cache.shape[1]
+
+        k_values = torch.empty(
+            (total_tokens, H_KV, D),
+            dtype=output_dtype,
+            device=k_cache.device,
+        )
+        v_values = torch.empty(
+            (total_tokens, H_KV, D),
+            dtype=output_dtype,
+            device=v_cache.device,
+        )
 
         grid = (block_table.shape[0],
                 (max_seq_len + BLOCK_SIZE - 1) // BLOCK_SIZE)
 
-        _vllm_layout_trans_kernel[grid](k_buffer,
-                                        v_buffer,
+        if output_dtype == torch.float16:
+            output_dtype = tl.float16
+        elif output_dtype == torch.bfloat16:
+            output_dtype = tl.bfloat16
+        else:
+            raise ValueError(f"Unsupported output dtype: {output_dtype}")
+
+        _vllm_layout_trans_kernel[grid](k_cache,
+                                        v_cache,
                                         k_values,
                                         v_values,
                                         b_query_lens_loc,
                                         b_seq_lens_loc,
                                         block_table,
                                         block_table.stride(0),
+                                        k_scale,
+                                        v_scale,
+                                        output_dtype=output_dtype,
                                         E_DIM=H_KV * D,
                                         BLOCK_SIZE=BLOCK_SIZE)
 
@@ -107,16 +139,22 @@ if current_platform.is_rocm():
         out: torch.Tensor,
         cu_seqlens_q: torch.Tensor,
         cu_seqlens_k: torch.Tensor,
-        total_tokens: int,
         max_seqlen_q: int,
         max_seqlen_k: int,
         softmax_scale: float,
         window_size: Optional[list[int]],  # -1 means infinite context window
         alibi_slopes: Optional[list[float]],
         block_table: torch.Tensor,
+        k_scale: torch.Tensor,
+        v_scale: torch.Tensor,
+        total_tokens: int = 0,
     ) -> torch.Tensor:
+        if total_tokens == 0:
+            total_tokens = int(cu_seqlens_k[-1].item())
         k, v = vllm_layout_trans(cu_seqlens_q, cu_seqlens_k, block_table,
-                                 k_cache, v_cache, max_seqlen_k, total_tokens)
+                                 k_cache, v_cache, max_seqlen_k, k_scale,
+                                 v_scale, q.dtype, total_tokens)
+
         output = aiter.flash_attn_varlen_func(
             q=q,
             k=k,
@@ -141,19 +179,21 @@ if current_platform.is_rocm():
         out: torch.Tensor,
         cu_seqlens_q: torch.Tensor,
         cu_seqlens_k: torch.Tensor,
-        total_tokens: int,
         max_seqlen_q: int,
         max_seqlen_k: int,
         softmax_scale: float,
         window_size: Optional[list[int]],  # -1 means infinite context window
         alibi_slopes: Optional[list[float]],
         block_table: torch.Tensor,
+        k_scale: torch.Tensor,
+        v_scale: torch.Tensor,
+        total_tokens: int = 0,
     ) -> torch.Tensor:
         return torch.empty(q.shape[0],
                            q.shape[1],
                            v_cache.shape[-2],
-                           dtype=torch.float8_e4m3fnuz,
-                           device="cuda")
+                           dtype=q.dtype,
+                           device=q.device)
 
     direct_register_custom_op("flash_attn_varlen_func",
                               flash_attn_varlen_func_impl, ["out"],
@@ -163,7 +203,33 @@ if current_platform.is_rocm():
 logger = init_logger(__name__)
 
 
-class AiterFlashAttentionMetadataBuilder:
+@dataclass
+class AiterFlashAttentionMetadata:
+    # NOTE(sang): Definition of context_len, query_len, and seq_len.
+    # |---------- N-1 iteration --------|
+    # |---------------- N iteration ---------------------|
+    # |- tokenA -|......................|-- newTokens ---|
+    # |---------- context_len ----------|
+    # |-------------------- seq_len ---------------------|
+    #                                   |-- query_len ---|
+
+    num_actual_tokens: int  # Number of tokens excluding padding.
+    max_query_len: int
+    query_start_loc: torch.Tensor
+    max_seq_len: int
+    seq_lens: torch.Tensor
+    slot_mapping: torch.Tensor
+    block_table: torch.Tensor
+
+    # For cascade attention.
+    use_cascade: bool
+    common_prefix_len: int
+    total_tokens: int
+
+
+class AiterFlashAttentionMetadataBuilder(
+        AttentionMetadataBuilder[AiterFlashAttentionMetadata]):
+    full_cudagraph_supported: ClassVar[bool] = True
 
     def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig,
                  device: torch.device):
@@ -180,14 +246,23 @@ class AiterFlashAttentionMetadataBuilder:
         self.headdim = self.model_config.get_head_size()
         self.block_size = kv_cache_spec.block_size
         self.kv_cache_spec = kv_cache_spec
-
         # Sliding window size to be used with the AOT scheduler will be
         # populated on first build() call.
         self.aot_sliding_window: Optional[tuple[int, int]] = None
+        self.total_tokens: int = 0
 
     def reorder_batch(self, input_batch, scheduler_output) -> bool:
         return False
 
+    def build_for_cudagraph_capture(
+            self, common_attn_metadata: CommonAttentionMetadata):
+        self.total_tokens = self.model_config.max_model_len \
+            * self.vllm_config.scheduler_config.max_num_partial_prefills
+        res = self.build(common_prefix_len=0,
+                         common_attn_metadata=common_attn_metadata)
+        self.total_tokens = 0
+        return res
+
     def build(self,
               common_prefix_len: int,
               common_attn_metadata: CommonAttentionMetadata,
@@ -195,43 +270,29 @@ class AiterFlashAttentionMetadataBuilder:
 
         num_actual_tokens = common_attn_metadata.num_actual_tokens
         max_query_len = common_attn_metadata.max_query_len
-
         max_seq_len = int(common_attn_metadata.seq_lens_cpu.max())
-        total_tokens = int(common_attn_metadata.seq_lens_cpu.sum())
         query_start_loc = common_attn_metadata.query_start_loc
         seq_lens = common_attn_metadata.seq_lens
         block_table_tensor = common_attn_metadata.block_table_tensor
         slot_mapping = common_attn_metadata.slot_mapping
 
-        cu_seq_lens = torch.zeros(seq_lens.shape[0] + 1,
-                                  dtype=torch.int32,
-                                  device=self.device)
-        torch.cumsum(seq_lens,
-                     dim=0,
-                     dtype=cu_seq_lens.dtype,
-                     out=cu_seq_lens[1:])
+        def schedule(batch_size, cu_query_lens, max_query_len, seqlens,
+                     max_seq_len, causal):
+            return None
 
         use_cascade = common_prefix_len > 0
 
-        cu_prefix_query_lens = None
-        prefix_kv_lens = None
-        suffix_kv_lens = None
-
         attn_metadata = AiterFlashAttentionMetadata(
             num_actual_tokens=num_actual_tokens,
             max_query_len=max_query_len,
             query_start_loc=query_start_loc,
             max_seq_len=max_seq_len,
             seq_lens=seq_lens,
-            cu_seq_lens=cu_seq_lens,
-            total_tokens=total_tokens,
             block_table=block_table_tensor,
             slot_mapping=slot_mapping,
             use_cascade=use_cascade,
             common_prefix_len=common_prefix_len,
-            cu_prefix_query_lens=cu_prefix_query_lens,
-            prefix_kv_lens=prefix_kv_lens,
-            suffix_kv_lens=suffix_kv_lens,
+            total_tokens=self.total_tokens,
         )
         return attn_metadata
 
@@ -254,7 +315,7 @@ class AiterFlashAttentionBackend(AttentionBackend):
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
-        return [32, 64, 96, 128, 160, 192, 224, 256]
+        return [64, 128, 256]
 
     @classmethod
     def validate_head_size(cls, head_size: int) -> None:
@@ -295,34 +356,6 @@ class AiterFlashAttentionBackend(AttentionBackend):
         return (2, num_blocks, block_size, num_kv_heads, head_size)
 
 
-@dataclass
-class AiterFlashAttentionMetadata:
-    # NOTE(sang): Definition of context_len, query_len, and seq_len.
-    # |---------- N-1 iteration --------|
-    # |---------------- N iteration ---------------------|
-    # |- tokenA -|......................|-- newTokens ---|
-    # |---------- context_len ----------|
-    # |-------------------- seq_len ---------------------|
-    #                                   |-- query_len ---|
-
-    num_actual_tokens: int  # Number of tokens excluding padding.
-    max_query_len: int
-    query_start_loc: torch.Tensor
-    max_seq_len: int
-    seq_lens: torch.Tensor
-    cu_seq_lens: torch.Tensor
-    total_tokens: int
-    block_table: torch.Tensor
-    slot_mapping: torch.Tensor
-
-    # For cascade attention.
-    use_cascade: bool
-    common_prefix_len: int
-    cu_prefix_query_lens: Optional[torch.Tensor]
-    prefix_kv_lens: Optional[torch.Tensor]
-    suffix_kv_lens: Optional[torch.Tensor]
-
-
 class AiterFlashAttentionImpl(AttentionImpl):
 
     def __init__(
@@ -366,10 +399,6 @@ class AiterFlashAttentionImpl(AttentionImpl):
                                       "encoder/decoder cross-attention "
                                       "are not implemented for "
                                       "FlashAttentionImpl")
-        if is_quantized_kv_cache(self.kv_cache_dtype):
-            raise NotImplementedError(
-                "AiterFlashAttention does not support fp8 kv-cache on this "
-                "device.")
 
     def forward(
         self,
@@ -440,12 +469,6 @@ class AiterFlashAttentionImpl(AttentionImpl):
         if self.kv_cache_dtype.startswith("fp8"):
             key_cache = key_cache.view(torch.float8_e4m3fnuz)
             value_cache = value_cache.view(torch.float8_e4m3fnuz)
-            num_tokens, num_heads, head_size = query.shape
-            query, _ = ops.scaled_fp8_quant(
-                query.reshape(
-                    (num_tokens, num_heads * head_size)).contiguous(),
-                layer._q_scale)
-            query = query.reshape((num_tokens, num_heads, head_size))
 
         if not attn_metadata.use_cascade:
             cu_seqlens_q = attn_metadata.query_start_loc
@@ -455,8 +478,16 @@ class AiterFlashAttentionImpl(AttentionImpl):
             block_table = attn_metadata.block_table
 
             if max_seqlen_q > 1:
-                cu_seq_lens = attn_metadata.cu_seq_lens
-                total_tokens = attn_metadata.total_tokens
+
+                cu_seq_lens = torch.zeros(seqused_k.shape[0] + 1,
+                                          dtype=torch.int32,
+                                          device=query.device)
+
+                torch.cumsum(seqused_k,
+                             dim=0,
+                             dtype=cu_seq_lens.dtype,
+                             out=cu_seq_lens[1:])
+
                 torch.ops.vllm.flash_attn_varlen_func(
                     query[:num_actual_tokens],
                     key_cache,
@@ -465,29 +496,31 @@ class AiterFlashAttentionImpl(AttentionImpl):
                     cu_seqlens_q=cu_seqlens_q,
                     max_seqlen_q=max_seqlen_q,
                     max_seqlen_k=max_seqlen_k,
-                    total_tokens=total_tokens,
                     softmax_scale=self.scale,
                     alibi_slopes=self.alibi_slopes,
                     window_size=self.sliding_window,
                     block_table=block_table,
-                    cu_seqlens_k=cu_seq_lens)
+                    cu_seqlens_k=cu_seq_lens,
+                    k_scale=layer._k_scale,
+                    v_scale=layer._v_scale,
+                    total_tokens=attn_metadata.total_tokens,
+                )
 
             _, num_heads, head_size = query.shape
-            _PARTITION_SIZE_ROCM = 256
+            nbytes_per_qo_elem = torch.finfo(query.dtype).bits // 8
             num_seqs = seqused_k.shape[0]
-            nbyes_per_qo_elem = torch.finfo(output.dtype).bits // 8
             max_num_partitions = (max_seqlen_k + _PARTITION_SIZE_ROCM -
                                   1) // _PARTITION_SIZE_ROCM
 
             workspace_buffer = torch.empty(
                 (num_seqs * num_heads * max_num_partitions * head_size) *
-                nbyes_per_qo_elem + 2 *
+                nbytes_per_qo_elem + 2 *
                 (num_seqs * num_heads * max_num_partitions) * 4,
                 dtype=torch.uint8,
                 device=output.device,
             )
 
-            aiter.paged_attention_v1(
+            torch.ops.aiter.paged_attention_v1(
                 output[:num_actual_tokens],
                 workspace_buffer,
                 query[:num_actual_tokens],

From 136d750f5f421ca5be2e24b0a913e813d99bb831 Mon Sep 17 00:00:00 2001
From: czhu-cohere <conway.zhu@cohere.com>
Date: Fri, 25 Jul 2025 06:53:21 -0700
Subject: [PATCH 10/57] [Kernel] Improve machete memory bound perf (#21556)

Signed-off-by: czhu-cohere <conway.zhu@cohere.com>
---
 csrc/quantization/machete/machete_prepacked_layout.cuh | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/csrc/quantization/machete/machete_prepacked_layout.cuh b/csrc/quantization/machete/machete_prepacked_layout.cuh
index 81aaa6c4f3a28..4a7d6341e6c00 100644
--- a/csrc/quantization/machete/machete_prepacked_layout.cuh
+++ b/csrc/quantization/machete/machete_prepacked_layout.cuh
@@ -187,8 +187,12 @@ struct PrepackedLayoutBTemplate {
   CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
       Shape_NKL shape_mkl) {
     auto layout = TVbNbKL_to_offset(shape_mkl);
-    return make_layout(coalesce(get<0>(layout)), get<1>(layout),
-                       get<2>(layout));
+    // for 4-bit elements, having >= 64 values per column
+    // allows TMA to load full 32-byte sectors
+    auto inner_layout =
+        make_layout(make_shape(_256{}, size<0>(layout) / _256{}));
+
+    return make_layout(inner_layout, get<1>(layout), get<2>(layout));
   }
 
   // ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)

From e189b50f53e333814d41278c5e5be66240c99018 Mon Sep 17 00:00:00 2001
From: mgazz <michele.gazzetti@gmail.com>
Date: Fri, 25 Jul 2025 15:01:27 +0100
Subject: [PATCH 11/57] Add support for Prithvi in Online serving mode (#21518)

Signed-off-by: Michele Gazzetti <michele.gazzetti1@ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 .../entrypoints/openai/test_skip_tokenizer.py | 93 +++++++++++++++++++
 vllm/engine/multiprocessing/client.py         | 20 ++--
 vllm/entrypoints/openai/serving_engine.py     | 14 ++-
 vllm/entrypoints/openai/serving_pooling.py    |  6 +-
 .../models/prithvi_geospatial_mae.py          |  5 +-
 5 files changed, 128 insertions(+), 10 deletions(-)
 create mode 100644 tests/entrypoints/openai/test_skip_tokenizer.py

diff --git a/tests/entrypoints/openai/test_skip_tokenizer.py b/tests/entrypoints/openai/test_skip_tokenizer.py
new file mode 100644
index 0000000000000..32d28277e0ef8
--- /dev/null
+++ b/tests/entrypoints/openai/test_skip_tokenizer.py
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import base64
+import io
+
+import numpy as np
+import pytest
+import requests
+import torch
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
+DTYPE = "float16"
+
+
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--task",
+        "embed",
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        DTYPE,
+        "--enforce-eager",
+        "--trust-remote-code",
+        "--skip-tokenizer-init",
+        "--max-num-seqs",
+        "32"
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_request(server: RemoteOpenAIServer, model_name: str):
+
+    pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
+    location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
+
+    buffer_tiff = io.BytesIO()
+    torch.save(pixel_values, buffer_tiff)
+    buffer_tiff.seek(0)
+    binary_data = buffer_tiff.read()
+    base64_tensor_embedding = base64.b64encode(binary_data).decode('utf-8')
+
+    buffer_coord = io.BytesIO()
+    torch.save(location_coords, buffer_coord)
+    buffer_coord.seek(0)
+    binary_data = buffer_coord.read()
+    base64_coord_embedding = base64.b64encode(binary_data).decode('utf-8')
+
+    prompt = {
+        "model":
+        model_name,
+        "additional_data": {
+            "prompt_token_ids": [1]
+        },
+        "encoding_format":
+        "base64",
+        "messages": [{
+            "role":
+            "user",
+            "content": [{
+                "type": "image_embeds",
+                "image_embeds": {
+                    "pixel_values": base64_tensor_embedding,
+                    "location_coords": base64_coord_embedding,
+                },
+            }],
+        }]
+    }
+
+    # test single pooling
+    response = requests.post(server.url_for("pooling"), json=prompt)
+    response.raise_for_status()
+
+    output = response.json()["data"][0]['data']
+
+    np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
+
+    assert len(np_response) == 524288
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 67d9a3bf6ce20..cde8fc367fb54 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -97,11 +97,16 @@ class MQLLMEngineClient(EngineClient):
         self.model_config = engine_config.model_config
         self.decoding_config = engine_config.decoding_config
 
-        # Create the tokenizer group.
-        self.tokenizer = init_tokenizer_from_configs(
-            model_config=self.model_config,
-            scheduler_config=engine_config.scheduler_config,
-            lora_config=engine_config.lora_config)
+        if self.vllm_config.model_config.skip_tokenizer_init:
+            self.tokenizer = None
+
+        else:
+            # Create the tokenizer group.
+            self.tokenizer = init_tokenizer_from_configs(
+                model_config=self.model_config,
+                scheduler_config=engine_config.scheduler_config,
+                lora_config=engine_config.lora_config)
+
         self.input_preprocessor = InputPreprocessor(self.model_config,
                                                     self.tokenizer)
 
@@ -375,7 +380,10 @@ class MQLLMEngineClient(EngineClient):
         return self.input_preprocessor
 
     async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
-        return await self.tokenizer.get_lora_tokenizer_async(lora_request)
+        if self.tokenizer is None:
+            return None
+        else:
+            return await self.tokenizer.get_lora_tokenizer_async(lora_request)
 
     async def get_vllm_config(self) -> VllmConfig:
         return self.vllm_config
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index edc366f9b8a88..9d848679d5d98 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -880,7 +880,10 @@ class OpenAIServing:
         _chat_template_kwargs.update(chat_template_kwargs or {})
 
         request_prompt: Union[str, list[int]]
-        if isinstance(tokenizer, MistralTokenizer):
+
+        if tokenizer is None:
+            request_prompt = "placeholder"
+        elif isinstance(tokenizer, MistralTokenizer):
             request_prompt = apply_mistral_chat_template(
                 tokenizer,
                 messages=messages,
@@ -910,7 +913,14 @@ class OpenAIServing:
             request = tool_parser(tokenizer).adjust_request(  # type: ignore
                 request=request)
 
-        if isinstance(request_prompt, str):
+        if tokenizer is None:
+            assert isinstance(request_prompt, str), (
+                "Prompt has to be a string", \
+                "when the tokenizer is not initialised"
+            )
+            prompt_inputs = TextTokensPrompt(prompt=request_prompt,
+                                             prompt_token_ids=[1])
+        elif isinstance(request_prompt, str):
             prompt_inputs = await self._tokenize_prompt_input_async(
                 request,
                 tokenizer,
diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py
index 12334cdac365a..38745d001ade6 100644
--- a/vllm/entrypoints/openai/serving_pooling.py
+++ b/vllm/entrypoints/openai/serving_pooling.py
@@ -96,7 +96,11 @@ class OpenAIServingPooling(OpenAIServing):
                 self.max_model_len, truncate_prompt_tokens)
             lora_request = self._maybe_get_adapters(request)
 
-            tokenizer = await self.engine_client.get_tokenizer(lora_request)
+            if self.model_config.skip_tokenizer_init:
+                tokenizer = None
+            else:
+                tokenizer = await self.engine_client.get_tokenizer(lora_request
+                                                                   )
 
             if isinstance(request, PoolingChatRequest):
                 (
diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py
index 0f00fd47fe4fc..304a9e987ee03 100644
--- a/vllm/model_executor/models/prithvi_geospatial_mae.py
+++ b/vllm/model_executor/models/prithvi_geospatial_mae.py
@@ -103,7 +103,10 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
         mm_kwargs = {}
 
         for k, v in mm_data.items():
-            mm_kwargs[k] = v
+            if isinstance(v, dict) and k == "image":
+                mm_kwargs.update(v)
+            else:
+                mm_kwargs[k] = v
         mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}
 
         # This model receives in input a multi-dimensional tensor representing

From 396ee941803d0382603f567bba41116bb3d04dda Mon Sep 17 00:00:00 2001
From: Kebe <mail@kebe7jun.com>
Date: Fri, 25 Jul 2025 22:33:56 +0800
Subject: [PATCH 12/57] [CI] Unifying Dockerfiles for ARM and X86 Builds
 (#21343)

Signed-off-by: Kebe <mail@kebe7jun.com>
---
 .github/workflows/lint-and-deploy.yaml        |  2 +-
 docker/Dockerfile.arm                         | 62 -------------------
 docker/Dockerfile.cpu                         | 24 ++++++-
 .../installation/cpu/arm.inc.md               |  2 +-
 requirements/cpu.txt                          |  6 +-
 5 files changed, 29 insertions(+), 67 deletions(-)
 delete mode 100644 docker/Dockerfile.arm

diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml
index 74a7a3a3530f5..d5736c0aee208 100644
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -7,7 +7,7 @@ permissions:
 
 jobs:
   lint-and-deploy:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04-arm
     steps:
       - name: Checkout
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
diff --git a/docker/Dockerfile.arm b/docker/Dockerfile.arm
deleted file mode 100644
index bad093684239c..0000000000000
--- a/docker/Dockerfile.arm
+++ /dev/null
@@ -1,62 +0,0 @@
-# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform.
-
-FROM ubuntu:22.04 AS cpu-test-arm
-
-ENV CCACHE_DIR=/root/.cache/ccache
-
-ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
-
-RUN --mount=type=cache,target=/var/cache/apt \
-    apt-get update -y \
-    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
-    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
-
-# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install py-cpuinfo  # Use this to gather CPU info and optimize based on ARM Neoverse cores
-
-# Set LD_PRELOAD for tcmalloc on ARM
-ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
-
-RUN echo 'ulimit -c 0' >> ~/.bashrc
-
-WORKDIR /workspace
-
-ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
-ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
-    pip install --upgrade pip && \
-    pip install -r requirements/build.txt
-
-FROM cpu-test-arm AS build
-
-WORKDIR /workspace/vllm
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
-    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
-    pip install -v -r requirements/cpu.txt
-
-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
-
-# Disabling AVX512 specific optimizations for ARM
-ARG VLLM_CPU_DISABLE_AVX512="true"
-ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=bind,source=.git,target=.git \
-    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
-    pip install dist/*.whl && \
-    rm -rf dist
-
-WORKDIR /workspace/
-
-RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
-
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
\ No newline at end of file
diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index 982c1ddf27438..5e49e87131ece 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -1,4 +1,11 @@
-# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
+# This vLLM Dockerfile is used to build images that can run vLLM on both x86_64 and arm64 CPU platforms.
+#
+# Supported platforms:
+#   - linux/amd64 (x86_64)
+#   - linux/arm64 (aarch64)
+#
+# Use the `--platform` option with `docker buildx build` to specify the target architecture, e.g.:
+#   docker buildx build --platform=linux/arm64 -f docker/Dockerfile.cpu .
 #
 # Build targets:
 #   vllm-openai (default): used for serving deployment
@@ -53,7 +60,20 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --upgrade pip && \
     uv pip install -r requirements/cpu.txt
 
-ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
+ARG TARGETARCH
+ENV TARGETARCH=${TARGETARCH}
+
+RUN if [ "$TARGETARCH" = "arm64" ]; then \
+        PRELOAD_PATH="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"; \
+    else \
+        PRELOAD_PATH="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so"; \
+    fi && \
+    echo "export LD_PRELOAD=$PRELOAD_PATH" >> ~/.bashrc
+
+# Ensure that the LD_PRELOAD environment variable for export is in effect.
+SHELL ["/bin/bash", "-c"]
+
+ENV LD_PRELOAD=${LD_PRELOAD}
 
 RUN echo 'ulimit -c 0' >> ~/.bashrc
 
diff --git a/docs/getting_started/installation/cpu/arm.inc.md b/docs/getting_started/installation/cpu/arm.inc.md
index 63ae351b395fb..cac578eefb1d7 100644
--- a/docs/getting_started/installation/cpu/arm.inc.md
+++ b/docs/getting_started/installation/cpu/arm.inc.md
@@ -33,7 +33,7 @@ Testing has been conducted on AWS Graviton3 instances for compatibility.
 # --8<-- [end:pre-built-images]
 # --8<-- [start:build-image-from-source]
 ```bash
-docker build -f docker/Dockerfile.arm \
+docker build -f docker/Dockerfile.cpu \
         --tag vllm-cpu-env .
 
 # Launching OpenAI server
diff --git a/requirements/cpu.txt b/requirements/cpu.txt
index d80354342bc20..6860275acab6f 100644
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -10,7 +10,8 @@ setuptools>=77.0.3,<80.0.0
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
 torch==2.7.0; platform_system == "Darwin"
-torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
+torch==2.7.0; platform_machine == "ppc64le"
+torch==2.6.0; platform_machine == "aarch64" # for arm64 CPUs, torch 2.7.0 has a issue: https://github.com/vllm-project/vllm/issues/17960
 
 # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
 torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
@@ -25,3 +26,6 @@ datasets # for benchmark scripts
 intel-openmp==2024.2.1; platform_machine == "x86_64"
 intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
 triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.
+
+# Use this to gather CPU info and optimize based on ARM Neoverse cores
+py-cpuinfo; platform_machine == "aarch64"

From 5ac3168ee342f4cae17b0b67375e647bd5dd9151 Mon Sep 17 00:00:00 2001
From: Wenhua Cheng <wenhua.cheng@intel.com>
Date: Fri, 25 Jul 2025 23:52:42 +0800
Subject: [PATCH 13/57] [Docs] add auto-round quantization readme  (#21600)

Signed-off-by: Wenhua Cheng <wenhua.cheng@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/features/quantization/README.md     |   1 +
 docs/features/quantization/auto_round.md | 103 +++++++++++++++++++++++
 2 files changed, 104 insertions(+)
 create mode 100644 docs/features/quantization/auto_round.md

diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md
index e8c3b11230786..e18c128f30fc9 100644
--- a/docs/features/quantization/README.md
+++ b/docs/features/quantization/README.md
@@ -6,6 +6,7 @@ Contents:
 
 - [Supported Hardware](supported_hardware.md)
 - [AutoAWQ](auto_awq.md)
+- [AutoRound](auto_round.md)
 - [BitsAndBytes](bnb.md)
 - [BitBLAS](bitblas.md)
 - [GGUF](gguf.md)
diff --git a/docs/features/quantization/auto_round.md b/docs/features/quantization/auto_round.md
new file mode 100644
index 0000000000000..2dfd847bb7d9a
--- /dev/null
+++ b/docs/features/quantization/auto_round.md
@@ -0,0 +1,103 @@
+# AutoRound
+
+[AutoRound](https://github.com/intel/auto-round) is Intel’s advanced quantization algorithm designed to produce highly efficient **INT2, INT3, INT4, and INT8**
+quantized large language models—striking an optimal balance between accuracy and deployment performance.
+
+AutoRound applies weight-only quantization to transformer-based models, enabling significant memory savings and faster
+inference while maintaining near-original accuracy. It supports a wide range of hardware platforms, including **CPUs,
+Intel GPUs, HPUs, and CUDA-enabled devices**.
+
+Please refer to the [AutoRound guide](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md) for more details.
+
+Key Features:
+
+✅ **AutoRound, AutoAWQ, AutoGPTQ, and GGUF** are supported
+
+✅ **10+ vision-language models (VLMs)** are supported
+
+✅ **Per-layer mixed-bit quantization** for fine-grained control
+
+✅ **RTN (Round-To-Nearest) mode** for quick quantization with slight accuracy loss
+
+✅ **Multiple quantization recipes**: best, base, and light
+
+✅ Advanced utilities such as immediate packing and support for **10+ backends**
+
+## Installation
+
+```bash
+uv pip install auto-round
+```
+
+## Quantizing a model
+
+For VLMs, please change to `auto-round-mllm` in CLI usage and `AutoRoundMLLM` in API usage.
+
+### CLI usage
+
+```bash
+auto-round \
+    --model Qwen/Qwen3-0.6B \
+    --bits 4 \
+    --group_size 128 \
+    --format "auto_round" \
+    --output_dir ./tmp_autoround
+```
+
+```bash
+auto-round \
+    --model Qwen/Qwen3-0.6B \
+    --format "gguf:q4_k_m" \
+    --output_dir ./tmp_autoround
+```
+
+### API usage
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from auto_round import AutoRound
+
+model_name = "Qwen/Qwen3-0.6B"
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+bits, group_size, sym = 4, 128, True
+autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym)
+
+# the best accuracy, 4-5X slower, low_gpu_mem_usage could save ~20G but ~30% slower
+# autoround = AutoRound(model, tokenizer, nsamples=512, iters=1000, low_gpu_mem_usage=True, bits=bits, group_size=group_size, sym=sym)
+
+# 2-3X speedup, slight accuracy drop at W4G128
+# autoround = AutoRound(model, tokenizer, nsamples=128, iters=50, lr=5e-3, bits=bits, group_size=group_size, sym=sym )
+
+output_dir = "./tmp_autoround"
+# format= 'auto_round'(default), 'auto_gptq', 'auto_awq'
+autoround.quantize_and_save(output_dir, format="auto_round")
+```
+
+## Running a quantized model with vLLM
+
+Here is some example code to run auto-round format in vLLM:
+
+```python
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "Hello, my name is",
+]
+sampling_params = SamplingParams(temperature=0.6, top_p=0.95)
+model_name = "Intel/DeepSeek-R1-0528-Qwen3-8B-int4-AutoRound"
+llm = LLM(model=model_name)
+
+outputs = llm.generate(prompts, sampling_params)
+
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
+
+# Acknowledgement
+
+Special thanks to open-source low precision libraries such as AutoGPTQ, AutoAWQ, GPTQModel, Triton, Marlin, and
+ExLLaMAV2 for providing low-precision CUDA kernels, which are leveraged in AutoRound.

From 7cfea0df390c154c1026f77d3682e2733ca4aca8 Mon Sep 17 00:00:00 2001
From: QiliangCui <derrhein@gmail.com>
Date: Fri, 25 Jul 2025 13:22:01 -0700
Subject: [PATCH 14/57] [TPU][Test] Rollback PR-21550. (#21619)

Signed-off-by: Qiliang Cui <derrhein@gmail.com>
---
 tests/v1/tpu/test_basic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py
index dd89059ded524..865b58bc7f4b0 100644
--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
@@ -59,7 +59,7 @@ def test_basic(
                 # actually test chunked prompt
                 max_num_batched_tokens=1024,
                 max_model_len=8192,
-                gpu_memory_utilization=0.95,
+                gpu_memory_utilization=0.7,
                 max_num_seqs=max_num_seqs,
                 tensor_parallel_size=tensor_parallel_size) as vllm_model:
             vllm_outputs = vllm_model.generate_greedy(example_prompts,

From 41d3082c416897092bc924bc341e86b3e49728ee Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 25 Jul 2025 17:06:48 -0700
Subject: [PATCH 15/57] Add Unsloth to RLHF.md (#21636)

---
 docs/training/rlhf.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/docs/training/rlhf.md b/docs/training/rlhf.md
index 4f75e4e01495c..f608a630ab7a5 100644
--- a/docs/training/rlhf.md
+++ b/docs/training/rlhf.md
@@ -2,10 +2,14 @@
 
 Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors.
 
-vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl).
+vLLM can be used to generate the completions for RLHF. Some ways to do this include using libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF), [verl](https://github.com/volcengine/verl) and [unsloth](https://github.com/unslothai/unsloth).
 
 See the following basic examples to get started if you don't want to use an existing library:
 
 - [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md)
 - [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md)
 - [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md)
+
+See the following notebooks showing how to use vLLM for GRPO:
+
+- [Qwen-3 4B GRPO using Unsloth + vLLM](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_(4B)-GRPO.ipynb)

From 75d29cf4e1d7e950c2308b12e944b507fb3e1916 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Fri, 25 Jul 2025 20:07:07 -0400
Subject: [PATCH 16/57] [Perf] Cuda Kernel for Int8 Per Token Group Quant
 (#21476)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 csrc/ops.h                                            |  5 +++++
 .../compressed_tensors/int8_quant_kernels.cu          | 10 ++++++++++
 csrc/quantization/fp8/per_token_group_quant.cu        |  6 +++++-
 csrc/quantization/per_token_group_quant_8bit.h        | 10 ++++++++++
 csrc/torch_bindings.cpp                               |  8 ++++++++
 .../layers/quantization/utils/int8_utils.py           | 11 +++++++++--
 6 files changed, 47 insertions(+), 3 deletions(-)
 create mode 100644 csrc/quantization/per_token_group_quant_8bit.h

diff --git a/csrc/ops.h b/csrc/ops.h
index 97a247d9d628c..207291eceb169 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -292,6 +292,11 @@ void per_token_group_quant_fp8(const torch::Tensor& input,
                                torch::Tensor& output_q, torch::Tensor& output_s,
                                int64_t group_size, double eps, double fp8_min,
                                double fp8_max, bool scale_ue8m0);
+
+void per_token_group_quant_int8(const torch::Tensor& input,
+                                torch::Tensor& output_q,
+                                torch::Tensor& output_s, int64_t group_size,
+                                double eps, double int8_min, double int8_max);
 #endif
 
 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
index 5cd2ac179768b..6a81f159f46ae 100644
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@@ -1,6 +1,8 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/all.h>
 
+#include "../per_token_group_quant_8bit.h"
+
 #include <cmath>
 
 #include "../../dispatch_utils.h"
@@ -336,3 +338,11 @@ void dynamic_scaled_int8_quant(
         }
       });
 }
+
+void per_token_group_quant_int8(const torch::Tensor& input,
+                                torch::Tensor& output_q,
+                                torch::Tensor& output_s, int64_t group_size,
+                                double eps, double int8_min, double int8_max) {
+  per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
+                             int8_min, int8_max);
+}
\ No newline at end of file
diff --git a/csrc/quantization/fp8/per_token_group_quant.cu b/csrc/quantization/fp8/per_token_group_quant.cu
index afc41faeca902..2609054f2072b 100644
--- a/csrc/quantization/fp8/per_token_group_quant.cu
+++ b/csrc/quantization/fp8/per_token_group_quant.cu
@@ -1,6 +1,8 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/util/Float8_e4m3fn.h>
 
+#include "../per_token_group_quant_8bit.h"
+
 #include <cmath>
 
 #include <cuda_fp16.h>
@@ -120,7 +122,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
                                 torch::Tensor& output_q,
                                 torch::Tensor& output_s, int64_t group_size,
                                 double eps, double min_8bit, double max_8bit,
-                                bool scale_ue8m0 = false) {
+                                bool scale_ue8m0) {
   TORCH_CHECK(input.is_contiguous());
   TORCH_CHECK(output_q.is_contiguous());
 
@@ -198,6 +200,8 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
       input.scalar_type(), "per_token_group_quant_8bit", ([&] {
         if (dst_type == at::ScalarType::Float8_e4m3fn) {
           LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn);
+        } else if (dst_type == at::ScalarType::Char) {
+          LAUNCH_KERNEL(scalar_t, int8_t);
         }
       }));
 
diff --git a/csrc/quantization/per_token_group_quant_8bit.h b/csrc/quantization/per_token_group_quant_8bit.h
new file mode 100644
index 0000000000000..537b61bc4303f
--- /dev/null
+++ b/csrc/quantization/per_token_group_quant_8bit.h
@@ -0,0 +1,10 @@
+#pragma once
+#include <torch/all.h>
+
+// TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders
+// 8-bit per-token-group quantization helper used by both FP8 and INT8
+void per_token_group_quant_8bit(const torch::Tensor& input,
+                                torch::Tensor& output_q,
+                                torch::Tensor& output_s, int64_t group_size,
+                                double eps, double min_8bit, double max_8bit,
+                                bool scale_ue8m0 = false);
\ No newline at end of file
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 95f8541bc9e2d..85b6abef00b03 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -624,6 +624,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("per_token_group_fp8_quant", torch::kCUDA,
            &per_token_group_quant_fp8);
 
+  // Compute per-token-group INT8 quantized tensor and scaling factor.
+  ops.def(
+      "per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! "
+      "output_s, int group_size, float eps, float int8_min, float int8_max) -> "
+      "()");
+  ops.impl("per_token_group_quant_int8", torch::kCUDA,
+           &per_token_group_quant_int8);
+
   // reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
   ops.def(
       "rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, "
diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py
index 1fdf7d174e25e..6840cabbf1ae3 100644
--- a/vllm/model_executor/layers/quantization/utils/int8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py
@@ -238,13 +238,20 @@ def per_token_group_quant_int8(
     int8_min = iinfo.min
 
     x_q = torch.empty_like(x, device=x.device, dtype=dtype)
-    M = x.numel() // group_size
-    N = group_size
     x_s = torch.empty(
         x.shape[:-1] + (x.shape[-1] // group_size, ),
         device=x.device,
         dtype=torch.float32,
     )
+    # prefer CUDA kernel if available
+    if current_platform.is_cuda():
+        torch.ops._C.per_token_group_quant_int8(x, x_q, x_s, group_size, eps,
+                                                float(int8_min),
+                                                float(int8_max))
+        return x_q, x_s
+
+    M = x.numel() // group_size
+    N = group_size
 
     BLOCK = triton.next_power_of_2(N)
     # heuristics for number of warps

From 2eddd437ba5e7ce80d7341bf87a3078802b01ba7 Mon Sep 17 00:00:00 2001
From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com>
Date: Fri, 25 Jul 2025 17:07:26 -0700
Subject: [PATCH 17/57] Add interleaved RoPE test for Llama4 (Maverick)
 (#21478)

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
---
 .../multimodal/generation/test_maverick.py    | 92 +++++++++++++++----
 1 file changed, 73 insertions(+), 19 deletions(-)

diff --git a/tests/models/multimodal/generation/test_maverick.py b/tests/models/multimodal/generation/test_maverick.py
index 306cf39002df2..bacc9ef94f49d 100644
--- a/tests/models/multimodal/generation/test_maverick.py
+++ b/tests/models/multimodal/generation/test_maverick.py
@@ -22,6 +22,9 @@ from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
                           GenerationConfig)
 
 from vllm import LLM, SamplingParams
+from vllm.v1.executor.abstract import Executor
+from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
+                                        FullAttentionSpec)
 
 from ....utils import multi_gpu_test
 
@@ -69,6 +72,26 @@ def run_maverick_serving(model: str):
         raise
 
 
+def get_rope_layers_config(model_path: str) -> list[int]:
+    """
+    Get the interleaved RoPE configuration from HuggingFace config
+
+    Args:
+        model_path: Path to the local directory containing the reduced
+            Maverick model checkpoint
+
+    Returns:
+        List of 0 or 1 indicating whether each layer uses RoPE and local attn
+        0 indicates that RoPE is not used while 1 indicates that RoPE is used.
+    """
+    config_path = Path(model_path) / "config.json"
+    model_config = json.loads(config_path.read_text())
+    text_config = model_config["text_config"]
+    no_rope_layers = text_config["no_rope_layers"]
+    print(f"Found no_rope_layers: {no_rope_layers}")
+    return no_rope_layers
+
+
 def create_reduced_maverick_model(
     original_model_name:
     str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
@@ -113,7 +136,6 @@ def create_reduced_maverick_model(
         print("Loading original model configuration...")
         original_config = AutoConfig.from_pretrained(original_model_name,
                                                      trust_remote_code=True)
-
         print("Creating reduced configuration...")
         reduced_config = create_reduced_config(original_config, text_layers,
                                                num_experts, vision_layers)
@@ -510,21 +532,32 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
           f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
 
 
-def run_reduced_model(model_path: str,
-                      should_profile: bool = False,
-                      **kwargs) -> None:
-    """Test the created reduced model with vLLM."""
-
-    print(f"\nTesting reduced model at {model_path}...")
-
-    llm = LLM(
-        model=model_path,
-        trust_remote_code=True,
-        max_model_len=512,  # Small context for testing
-        gpu_memory_utilization=0.3,  # Conservative memory usage
-        **kwargs,
+def check_attention_spec_interleaved_rope(
+    llm: LLM,
+    num_attention_layers: int,
+    num_ranks: int,
+    rope_layers: list[int],
+):
+    """Check that the attention spec is correct."""
+    assert isinstance(llm.llm_engine.model_executor, Executor)
+    kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs(
     )
+    for rank in range(num_ranks):
+        kv_cache_specs = kv_cache_specs_per_rank[rank]
+        assert len(kv_cache_specs.keys()) == num_attention_layers
+        for i in range(num_attention_layers):
+            if rope_layers[i] == 0:
+                expected_spec = FullAttentionSpec
+            else:
+                expected_spec = ChunkedLocalAttentionSpec
+            assert isinstance(
+                kv_cache_specs[
+                    f"language_model.model.layers.{i}.self_attn.attn"],
+                expected_spec)
 
+
+def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
+    """Test the created reduced model with vLLM."""
     sampling_params = SamplingParams(temperature=0.8,
                                      top_p=0.95,
                                      max_tokens=50)
@@ -551,6 +584,7 @@ def run_reduced_model(model_path: str,
 @pytest.mark.parametrize("tp,ep", [(2, True)])
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 def test_dummy_maverick(
+    monkeypatch,
     original_model_name: str,
     text_layers: int,
     num_experts: int,
@@ -562,6 +596,10 @@ def test_dummy_maverick(
     force_recreate: bool = True,
     profile: bool = False,
 ) -> None:
+    # Disable multiprocessing allows us to access model executor from LLM engine
+    monkeypatch.setenv("VLLM_USE_V1", "1")
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
     model_path = create_reduced_maverick_model(
         original_model_name=original_model_name,
         output_dir=output_dir,
@@ -573,11 +611,27 @@ def test_dummy_maverick(
 
     print(f"\nReduced model created successfully at: {model_path}")
 
-    run_reduced_model(model_path=model_path,
-                      should_profile=profile,
-                      enforce_eager=enforce_eager,
-                      tensor_parallel_size=tp,
-                      enable_expert_parallel=ep)
+    rope_layers = get_rope_layers_config(model_path)
+
+    llm = LLM(
+        model=model_path,
+        trust_remote_code=True,
+        max_model_len=512,  # Small context for testing
+        gpu_memory_utilization=0.3,  # Conservative memory usage
+        enforce_eager=enforce_eager,
+        tensor_parallel_size=tp,
+        enable_expert_parallel=ep,
+    )
+
+    check_attention_spec_interleaved_rope(
+        llm,
+        text_layers,
+        tp,
+        rope_layers,
+    )
+
+    print(f"\nTesting reduced model at {model_path}...")
+    run_reduced_model(llm=llm, should_profile=profile)
 
 
 def main():

From cea96a015678c86789fa86a719ce7d6d176d78fd Mon Sep 17 00:00:00 2001
From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Date: Fri, 25 Jul 2025 17:07:58 -0700
Subject: [PATCH 18/57] [Bugfix] Fix sync_and_slice_intermediate_tensors
 (#21537)

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
---
 vllm/v1/worker/gpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 5fe594db667a5..6ddb2c422dff7 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1270,7 +1270,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         if sync_self:
             assert intermediate_tensors is not None
             for k, v in intermediate_tensors.items():
-                is_scattered = "residual" and is_residual_scattered
+                is_scattered = k == "residual" and is_residual_scattered
                 copy_len = num_tokens // tp if is_scattered else \
                     num_tokens
                 self.intermediate_tensors[k][:copy_len].copy_(

From c7742d61134783b50098ab249f6815051a4c4a2a Mon Sep 17 00:00:00 2001
From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Date: Fri, 25 Jul 2025 17:08:30 -0700
Subject: [PATCH 19/57] [Bugfix] Always set RAY_ADDRESS for Ray actor before
 spawn (#21540)

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
---
 vllm/utils/__init__.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 9f4140ac64e2f..054037b8932b7 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -2883,26 +2883,27 @@ def _maybe_force_spawn():
     if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") == "spawn":
         return
 
-    reason = None
-    if cuda_is_initialized():
-        reason = "CUDA is initialized"
-    elif xpu_is_initialized():
-        reason = "XPU is initialized"
-    elif is_in_ray_actor():
+    reasons = []
+    if is_in_ray_actor():
         # even if we choose to spawn, we need to pass the ray address
         # to the subprocess so that it knows how to connect to the ray cluster.
         # env vars are inherited by subprocesses, even if we use spawn.
         import ray
         os.environ["RAY_ADDRESS"] = ray.get_runtime_context().gcs_address
-        reason = "In a Ray actor and can only be spawned"
+        reasons.append("In a Ray actor and can only be spawned")
 
-    if reason is not None:
+    if cuda_is_initialized():
+        reasons.append("CUDA is initialized")
+    elif xpu_is_initialized():
+        reasons.append("XPU is initialized")
+
+    if reasons:
         logger.warning(
             "We must use the `spawn` multiprocessing start method. "
             "Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
             "See https://docs.vllm.ai/en/latest/usage/"
             "troubleshooting.html#python-multiprocessing "
-            "for more information. Reason: %s", reason)
+            "for more information. Reasons: %s", "; ".join(reasons))
         os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
 

From f1b286b2fbbde18745d57b0ce7ac4fbc56f10f0d Mon Sep 17 00:00:00 2001
From: Chengji Yao <chengjiyao@google.com>
Date: Fri, 25 Jul 2025 17:09:00 -0700
Subject: [PATCH 20/57] [TPU] Update ptxla nightly version to 20250724 (#21555)

Signed-off-by: Chengji Yao <chengjiyao@google.com>
---
 docker/Dockerfile.tpu | 2 +-
 requirements/tpu.txt  | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/docker/Dockerfile.tpu b/docker/Dockerfile.tpu
index 3474ff50de7bd..b9fc9def88190 100644
--- a/docker/Dockerfile.tpu
+++ b/docker/Dockerfile.tpu
@@ -1,4 +1,4 @@
-ARG NIGHTLY_DATE="20250714"
+ARG NIGHTLY_DATE="20250724"
 ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"
 
 FROM $BASE_IMAGE
diff --git a/requirements/tpu.txt b/requirements/tpu.txt
index d86f643d388ba..2d0d8bd8457e3 100644
--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@@ -19,8 +19,8 @@ nixl==0.3.0
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch==2.9.0.dev20250716
-torchvision==0.24.0.dev20250716
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp312-cp312-linux_x86_64.whl ; python_version == "3.12"
+torch==2.9.0.dev20250724
+torchvision==0.24.0.dev20250724
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp312-cp312-linux_x86_64.whl ; python_version == "3.12"
 

From 7ae75fa6d02afc45637b060ce7a535a1bd547afd Mon Sep 17 00:00:00 2001
From: Alex Kogan <82225080+sakogan@users.noreply.github.com>
Date: Fri, 25 Jul 2025 21:09:34 -0400
Subject: [PATCH 21/57] [Feature] Add support for MoE models in the
 calibration-free RTN-based quantization (#20766)

Signed-off-by: Alex Kogan <alex.kogan@oracle.com>
---
 tests/quantization/test_rtn.py                |   5 +-
 .../model_executor/layers/quantization/rtn.py | 234 +++++++++++++++---
 2 files changed, 201 insertions(+), 38 deletions(-)

diff --git a/tests/quantization/test_rtn.py b/tests/quantization/test_rtn.py
index 133b2d9e4df69..bc2b468f97d8c 100644
--- a/tests/quantization/test_rtn.py
+++ b/tests/quantization/test_rtn.py
@@ -8,7 +8,10 @@ import pytest
 
 from tests.quantization.utils import is_quant_method_supported
 
-MODELS = ["microsoft/Phi-3-mini-4k-instruct"]
+MODELS = [
+    "microsoft/Phi-3-mini-4k-instruct",  # dense model
+    "ai21labs/Jamba-tiny-dev",  # MoE model
+]
 
 
 @pytest.mark.skipif(not is_quant_method_supported("rtn"),
diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py
index 68309716cf901..cceaf9857c40f 100644
--- a/vllm/model_executor/layers/quantization/rtn.py
+++ b/vllm/model_executor/layers/quantization/rtn.py
@@ -3,18 +3,19 @@
 # Copyright © 2025, Oracle and/or its affiliates.
 
 import os
-from typing import Any, Optional
+from typing import Any, Callable, Optional
 
 import torch
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter
 
 from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                set_weight_attrs)
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+    QuantizationConfig, QuantizeMethodBase)
 
 logger = init_logger(__name__)
 """By default, use 8 bit as target precision, but it can be 
@@ -71,9 +72,11 @@ class RTNConfig(QuantizationConfig):
         return cls(weight_bits, group_size)
 
     def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["RTNLinearMethod"]:
+                         prefix: str) -> Optional["QuantizeMethodBase"]:
         if isinstance(layer, LinearBase):
             return RTNLinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return RTNMoEMethod(self)
         return None
 
 
@@ -94,11 +97,18 @@ class RTNTensor:
             self.data.narrow(dim, start // factor, length // factor),
             self.scale.narrow(dim, start, length), self.quant_config)
 
+    def __getitem__(self, key):
+        return RTNTensor(self.data[key], self.scale[key], self.quant_config)
+
     @property
     def shape(self):
         shape = self.data.shape
         factor = 1 if self.quant_config.weight_bits == 8 else 2
-        return torch.Size((shape[0] * factor, shape[1]))
+        batch_present = len(shape) == 3
+        if batch_present:
+            return torch.Size((shape[0], shape[1] * factor, shape[2]))
+        else:
+            return torch.Size((shape[0] * factor, shape[1]))
 
     def copy_(self, loaded_weight: torch.Tensor) -> None:
         qweight, weight_scale = rtn_quantize(loaded_weight.cuda(),
@@ -165,7 +175,7 @@ class RTNLinearMethod(LinearMethodBase):
         weight = RTNParameter(data=torch.empty(output_size_per_partition //
                                                factor,
                                                input_size_per_partition,
-                                               dtype=torch.int8),
+                                               dtype=torch.uint8),
                               scale=scale,
                               quant_config=self.quant_config)
 
@@ -180,18 +190,7 @@ class RTNLinearMethod(LinearMethodBase):
         layer.output_size_per_partition = output_size_per_partition
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        """torch.compile does not know how to deal with a Parameter subclass
-        (aka RTNParameter). As we don't really need RTNParameters for the
-        forward pass, we replace them with equivalent instances of Parameters.
-        """
-        old_weight = layer.weight
-        assert isinstance(old_weight, RTNParameter)
-        data = old_weight.data.data
-
-        delattr(layer, "weight")
-
-        new_weight = Parameter(data=data, requires_grad=False)
-        layer.register_parameter("weight", new_weight)
+        fix_weights(layer, "weight")
 
     def apply(self,
               layer: torch.nn.Module,
@@ -209,6 +208,128 @@ class RTNLinearMethod(LinearMethodBase):
         return out
 
 
+class RTNMoEMethod(FusedMoEMethodBase):
+
+    def __init__(self, quant_config: RTNConfig):
+        self.quant_config = quant_config
+
+    def create_weights(self, layer: torch.nn.Module, num_experts: int,
+                       hidden_size: int, intermediate_size_per_partition: int,
+                       params_dtype: torch.dtype, **extra_weight_attrs):
+
+        factor = 1 if self.quant_config.weight_bits == 8 else 2
+
+        # Fused gate_up_proj (column parallel)
+        num_groups_per_col = (hidden_size // self.quant_config.group_size
+                              if self.quant_config.group_size != -1 else 1)
+        w13_scale = Parameter(
+            torch.empty(num_experts,
+                        2 * intermediate_size_per_partition,
+                        num_groups_per_col,
+                        dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_scale", w13_scale)
+
+        w13_weight = RTNParameter(data=torch.empty(
+            num_experts,
+            2 * intermediate_size_per_partition // factor,
+            hidden_size,
+            dtype=torch.uint8),
+                                  scale=w13_scale,
+                                  quant_config=self.quant_config)
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        # down_proj (row parallel)
+        num_groups_per_col = (intermediate_size_per_partition //
+                              self.quant_config.group_size
+                              if self.quant_config.group_size != -1 else 1)
+        w2_scale = Parameter(torch.zeros(num_experts,
+                                         hidden_size,
+                                         num_groups_per_col,
+                                         dtype=params_dtype),
+                             requires_grad=False)
+        layer.register_parameter("w2_scale", w2_scale)
+
+        w2_weight = RTNParameter(data=torch.empty(
+            num_experts,
+            hidden_size // factor,
+            intermediate_size_per_partition,
+            dtype=torch.uint8),
+                                 scale=w2_scale,
+                                 quant_config=self.quant_config)
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        weight_bits = self.quant_config.weight_bits
+        fix_weights(layer, "w13_weight", weight_bits == 4)
+        fix_weights(layer, "w2_weight", weight_bits == 4)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if enable_eplb:
+            raise NotImplementedError(
+                "EPLB not supported for `RTNMoEMethod` yet.")
+
+        from vllm.model_executor.layers.fused_moe import fused_experts
+
+        topk_weights, topk_ids = FusedMoE.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)
+
+        weight_bits = self.quant_config.weight_bits
+        group_size = self.quant_config.group_size
+
+        ret = fused_experts(
+            x,
+            layer.w13_weight,
+            layer.w2_weight,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            inplace=True,
+            activation=activation,
+            use_int4_w4a16=weight_bits == 4,
+            use_int8_w8a16=weight_bits == 8,
+            global_num_experts=global_num_experts,
+            w1_scale=layer.w13_scale,
+            w2_scale=layer.w2_scale,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            expert_map=expert_map,
+            block_shape=[0, group_size])
+
+        return ret
+
+
 def rtn_quantize(tensor: torch.Tensor, num_bits: int,
                  group_size: int) -> tuple[torch.Tensor, torch.Tensor]:
     """Quantize a tensor using per-group static scaling factor.
@@ -221,34 +342,44 @@ def rtn_quantize(tensor: torch.Tensor, num_bits: int,
                     If equal to -1, each row in the input tensor is treated
                     as one group.
     """
+    batch_present = len(tensor.shape) == 3
+    if not batch_present:
+        tensor = tensor.unsqueeze(0)
 
     q_range = 2**num_bits
-    num_groups = (tensor.shape[0] * tensor.shape[1] //
-                  group_size if group_size != -1 else tensor.shape[0])
+    num_groups = (tensor.shape[1] * tensor.shape[2] //
+                  group_size if group_size != -1 else tensor.shape[1])
     """Calculate a scaling factor per input group.
     """
-    input_flat = tensor.reshape(num_groups, -1)
-    input_min = torch.min(input_flat, dim=1, keepdim=True)[0]
-    input_max = torch.max(input_flat, dim=1, keepdim=True)[0]
+    input_flat = tensor.reshape(tensor.shape[0], num_groups, -1)
+    input_min = torch.min(input_flat, dim=2, keepdim=True)[0]
+    input_max = torch.max(input_flat, dim=2, keepdim=True)[0]
     input_max_abs = torch.max(input_min.abs(), input_max.abs())
     scale = (input_max_abs * 2.0 / (q_range - 1))
-    """Scale each input group, truncate and round to the nearest integer.
+    """Scale each input group, round to the nearest integer, shift 
+    the range and truncate.
     """
     scaled_input = input_flat / scale
-    scaled_input = scaled_input.clamp(-q_range // 2, q_range // 2 - 1)
     scaled_input = scaled_input.round()
+    scaled_input += q_range // 2
+    scaled_input = scaled_input.clamp(0, q_range - 1)
 
-    scale = scale.reshape(tensor.shape[0], -1).contiguous()
-    inputs_q = scaled_input.reshape(tensor.shape).to(torch.int8)
+    scale = scale.reshape(tensor.shape[0], tensor.shape[1], -1).contiguous()
+    inputs_q = scaled_input.reshape(tensor.shape).to(torch.uint8)
     inputs_q = inputs_q.contiguous()
 
     if num_bits == 4:
         """Pack two 4-bit values into each byte.
         """
-        inputs_q = (inputs_q[:, 1::2] << 4) | (inputs_q[:, ::2] & 0xf)
-        inputs_q = inputs_q.reshape(tensor.shape[0] // 2, tensor.shape[1])
+        inputs_q = (inputs_q[:, :, 1::2] << 4) | (inputs_q[:, :, ::2] & 0xf)
+        inputs_q = inputs_q.reshape(tensor.shape[0], tensor.shape[1] // 2,
+                                    tensor.shape[2])
         inputs_q = inputs_q.contiguous()
 
+    if not batch_present:
+        inputs_q = inputs_q.squeeze(0)
+        scale = scale.squeeze(0)
+
     return inputs_q, scale
 
 
@@ -259,31 +390,60 @@ def rtn_dequantize(tensor: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
         tensor: The input tensor.
         scale: The tensor with per-group scale factors.
     """
+    batch_present = len(tensor.shape) == 3
+    if not batch_present:
+        tensor = tensor.unsqueeze(0)
+        scale = scale.unsqueeze(0)
 
-    num_groups = scale.size(0) * scale.size(1)
-    input_dim, output_dim = tensor.shape
+    num_groups = scale.size(1) * scale.size(2)
+    batch, input_dim, output_dim = tensor.shape
 
-    num_bits = 8 if input_dim == scale.size(0) else 4
+    num_bits = 8 if input_dim == scale.size(1) else 4
+    q_range = 2**num_bits
     if num_bits == 4:
         input_dim *= 2
 
-    data = torch.empty((input_dim, output_dim),
+    data = torch.empty((batch, input_dim, output_dim),
                        dtype=scale.dtype,
                        device=tensor.device)
 
     if num_bits == 8:
         data.copy_(tensor)
+        data -= q_range // 2
     else:
         """Unpack two 4-bit values from each byte.
         """
-        tensor = tensor.reshape(input_dim, output_dim // 2)
+        tensor = tensor.reshape(batch, input_dim, output_dim // 2)
         for i in range(2):
-            data[:, i::2] = (tensor << 4 * (1 - i)) >> 4
+            data[:, :, i::2] = ((tensor << 4 *
+                                 (1 - i)) >> 4).to(torch.int8) - q_range // 2
     """Scale each input group with its scaling factor.
     """
-    scale = scale.reshape(num_groups, -1)
-    data = data.reshape(num_groups, -1)
+    scale = scale.reshape(batch, num_groups, -1)
+    data = data.reshape(batch, num_groups, -1)
     data = torch.mul(data, scale)
 
-    input_deq = data.reshape((input_dim, output_dim)).contiguous()
+    input_deq = data.reshape((batch, input_dim, output_dim)).contiguous()
+    if not batch_present:
+        input_deq = input_deq.squeeze(0)
+
     return input_deq
+
+
+def fix_weights(layer: torch.nn.Module,
+                param_name: str,
+                reshape: bool = False):
+    """torch.compile does not know how to deal with a Parameter subclass
+    (aka RTNParameter). As we don't really need RTNParameters for the
+    forward pass, we replace them with equivalent instances of Parameters.
+    """
+    old_weight = getattr(layer, param_name)
+    assert isinstance(old_weight, RTNParameter)
+    data = old_weight.data.data
+
+    delattr(layer, param_name)
+
+    if reshape:
+        data = data.reshape(old_weight.shape[0], old_weight.shape[1] * 2, -1)
+    new_weight = Parameter(data=data, requires_grad=False)
+    layer.register_parameter(param_name, new_weight)

From 62965de5fe8be8e3622952a9b5cda86973cf9c51 Mon Sep 17 00:00:00 2001
From: Farzad Abdolhosseini <farzad.abdolhosseini@gmail.com>
Date: Sat, 26 Jul 2025 04:12:31 +0300
Subject: [PATCH 22/57] [Model] Ultravox: Support Llama 4 and Gemma 3 backends
 (#17818)

Signed-off-by: Farzad Abdolhosseini <farzad@fixie.ai>
Signed-off-by: Patrick Li <patrick8289@gmail.com>
Co-authored-by: Patrick Li <patrick8289@gmail.com>
---
 tests/models/registry.py                    |  2 ++
 vllm/model_executor/models/registry.py      |  1 +
 vllm/model_executor/models/ultravox.py      | 38 +++++++++++++--------
 vllm/transformers_utils/configs/ultravox.py | 22 +++++++-----
 4 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/tests/models/registry.py b/tests/models/registry.py
index 1800262ced67f..b41e432d738a7 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -221,6 +221,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                                 "fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"}),  # noqa: E501
     "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
                                         is_available_online=False),
+    "Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
+                                         is_available_online=False),
     "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
     "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
     "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"),  # noqa: E501
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 14a8ac7876f73..9b204fdcbe1a5 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -89,6 +89,7 @@ _TEXT_GENERATION_MODELS = {
     "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
     "JambaForCausalLM": ("jamba", "JambaForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
+    "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"),  # noqa: E501
     # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
     "MambaForCausalLM": ("mamba", "MambaForCausalLM"),
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 3697e3fd0cf43..a4569ccd5a845 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -39,9 +39,7 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                     merge_multimodal_embeddings,
                     merge_multimodal_embeddings_from_map)
 
-_AUDIO_PLACEHOLDER_OVERRIDE = "<|reserved_special_token_0|>"
-_AUDIO_PLACEHOLDER_TOKEN = 128002
-_AUDIO_TOKENS_PER_SECOND = 6.25
+_AUDIO_PLACEHOLDER_OVERRIDE = "<|audio|>"
 _MAX_ENCODER_BATCH_SIZE = 16
 
 
@@ -80,14 +78,15 @@ class UltravoxProcessingInfo(BaseProcessingInfo):
         sampling_rate: Optional[int] = None,
         **kwargs: object,
     ) -> ProcessorMixin:
+        config = self.ctx.model_config.hf_config
         hf_processor = self.ctx.get_hf_processor(**kwargs)
 
         # NOTE: Ultravox processing definition uses '<|eot_id|>' as the
         # placeholder that will cause confusion with the actual end of turn
-        # token, thus we override placeholder with a reserved special
-        # token.
+        # token, thus we override placeholder with a reserved token.
         hf_processor.audio_token_replacement = _AUDIO_PLACEHOLDER_OVERRIDE
-        hf_processor.audio_replacement_token_id = _AUDIO_PLACEHOLDER_TOKEN
+        hf_processor.audio_replacement_token_id = config.audio_token_index
+
         return hf_processor
 
     def get_feature_extractor(
@@ -274,7 +273,7 @@ class UltravoxProjector(nn.Module):
         else:
             self.act = get_act_fn(config.projector_act)
 
-        dim_out = config.text_config.hidden_size
+        dim_out = config.text_hidden_size
         self.linear_2 = nn.Linear(dim_mid, dim_out, bias=False)
 
         # Ultravox v0.4.1 and below use layer_norm after the second linear layer
@@ -572,9 +571,14 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
         input_ids: torch.Tensor,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
+        # The audio token index is not included in the embedding table
+        # We need to remove it before embedding lookup
+        safe_input_ids = input_ids.clone()
+        safe_input_ids[safe_input_ids == self.config.audio_token_index] = 0
+        inputs_embeds = self.language_model.get_input_embeddings(
+            safe_input_ids)
+        if multimodal_embeddings is not None and len(
+                multimodal_embeddings) > 0:
 
             # TODO(ywang96): remove this block after v0 is deprecated.
             if not envs.VLLM_USE_V1:
@@ -585,7 +589,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
             else:
                 inputs_embeds = merge_multimodal_embeddings(
                     input_ids, inputs_embeds, multimodal_embeddings,
-                    _AUDIO_PLACEHOLDER_TOKEN)
+                    self.config.audio_token_index)
         return inputs_embeds
 
     def forward(self,
@@ -623,10 +627,14 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
                                                       multimodal_embeddings)
             input_ids = None
 
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        language_model = self.language_model
+        if hasattr(language_model, "language_model"):
+            language_model = language_model.language_model
+
+        hidden_states = language_model.model(input_ids,
+                                             positions,
+                                             intermediate_tensors,
+                                             inputs_embeds=inputs_embeds)
         return hidden_states
 
     def compute_logits(self, hidden_states: torch.Tensor,
diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py
index 62f63b02d49a4..87064cc12deda 100644
--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@@ -45,6 +45,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
     """
 
     model_type = "ultravox"
+    audio_token = "<|audio|>"
     is_composition = False
 
     def __init__(
@@ -80,29 +81,32 @@ class UltravoxConfig(transformers.PretrainedConfig):
             # Avoid circular import
             from vllm.transformers_utils.config import get_config
 
-            self.text_config = get_config(text_model_id,
-                                          trust_remote_code=False)
+            text_config_obj = get_config(text_model_id,
+                                         trust_remote_code=False)
         else:
             text_config = text_config or {}
-            self.text_config = transformers.CONFIG_MAPPING[text_config.get(
+            text_config_obj = transformers.CONFIG_MAPPING[text_config.get(
                 "model_type", "llama")](**text_config)
 
+        inner_text_config = text_config_obj.get_text_config()
+
         if audio_model_id is not None:
             # Avoid circular import
             from vllm.transformers_utils.config import get_config
 
-            self.audio_config = get_config(audio_model_id,
-                                           trust_remote_code=False)
+            audio_config = get_config(audio_model_id, trust_remote_code=False)
         else:
             audio_config = audio_config or {}
-            self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
+            audio_config = transformers.CONFIG_MAPPING[audio_config.get(
                 "model_type", "whisper")](**audio_config)
 
+        self.text_config = text_config_obj
+        self.audio_config = audio_config
         self.text_model_lora_config = text_model_lora_config or {}
         self.audio_model_lora_config = audio_model_lora_config or {}
 
-        self.vocab_size = self.text_config.vocab_size
-
-        self.initializer_range = self.text_config.initializer_range
+        self.vocab_size = inner_text_config.vocab_size
+        self.initializer_range = inner_text_config.initializer_range
+        self.text_hidden_size = inner_text_config.hidden_size
 
         super().__init__(**kwargs)

From 97349fe2bc68de69550787135c1a8c6b85fc8d81 Mon Sep 17 00:00:00 2001
From: WeiQing Chen <40507679+david6666666@users.noreply.github.com>
Date: Sat, 26 Jul 2025 09:37:32 +0800
Subject: [PATCH 23/57] [Docs] add offline serving multi-modal video input
 expamle Qwen2.5-VL (#21530)

Signed-off-by: David Chen <530634352@qq.com>
---
 docs/features/multimodal_inputs.md | 64 ++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index e820ace4f8fe7..e83dfdb11dadc 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -177,6 +177,70 @@ Multi-image input can be extended to perform video captioning. We show this with
 You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
 instead of using multi-image input.
 
+Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown in this example using Qwen2.5-VL:
+
+??? code
+
+    ```python
+    from transformers import AutoProcessor
+    from vllm import LLM, SamplingParams
+    from qwen_vl_utils import process_vision_info
+
+    model_path = "Qwen/Qwen2.5-VL-3B-Instruct/"
+    video_path = "https://content.pexels.com/videos/free-videos.mp4"
+
+    llm = LLM(
+        model=model_path,
+        gpu_memory_utilization=0.8,
+        enforce_eager=True,
+        limit_mm_per_prompt={"video": 1},
+    )
+
+    sampling_params = SamplingParams(
+        max_tokens=1024,
+    )
+
+    video_messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": [
+                {"type": "text", "text": "describe this video."},
+                {
+                    "type": "video",
+                    "video": video_path,
+                    "total_pixels": 20480 * 28 * 28,
+                    "min_pixels": 16 * 28 * 28
+                }
+            ]
+        },
+    ]
+
+    messages = video_messages
+    processor = AutoProcessor.from_pretrained(model_path)
+    prompt = processor.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+
+    image_inputs, video_inputs = process_vision_info(messages)
+    mm_data = {}
+    if video_inputs is not None:
+        mm_data["video"] = video_inputs
+
+    llm_inputs = {
+        "prompt": prompt,
+        "multi_modal_data": mm_data,
+    }
+
+    outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+    ```
+
+    !!! note
+        'process_vision_info' is only applicable to Qwen2.5-VL and similar models.
+
 Full example: <gh-file:examples/offline_inference/vision_language.py>
 
 ### Audio Inputs

From a55c95096b3537edfbbb7a5eafae0b0475c5ef07 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Fri, 25 Jul 2025 19:06:21 -0700
Subject: [PATCH 24/57] Correctly kill vLLM processes after finishing serving
 benchmarks (#21641)

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 .../scripts/run-nightly-benchmarks.sh              | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
index 4d01a314adc47..4162905bb3cc3 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -95,12 +95,14 @@ json2args() {
 }
 
 kill_gpu_processes() {
-  pkill -f python
-  pkill -f python3
-  pkill -f tritonserver
-  pkill -f pt_main_thread
-  pkill -f text-generation
-  pkill -f lmdeploy
+  pkill -f '[p]ython'
+  pkill -f '[p]ython3'
+  pkill -f '[t]ritonserver'
+  pkill -f '[p]t_main_thread'
+  pkill -f '[t]ext-generation'
+  pkill -f '[l]mdeploy'
+  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
+  pkill -f '[V]LLM'
 
   while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
     sleep 1

From 2f6e6b33fb0acf5331982c8e1ea620005e3a19ba Mon Sep 17 00:00:00 2001
From: Alexandre JUAN <alexandre.juan@epitech.eu>
Date: Sat, 26 Jul 2025 05:11:10 +0200
Subject: [PATCH 25/57] [Bugfix] Fix isinstance check for tensor types in
 _load_prompt_embeds to use dtype comparison (#21612)

Signed-off-by: Alexandre Juan <a.juan@netheos.net>
---
 vllm/entrypoints/openai/serving_engine.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 9d848679d5d98..71976fea1ee77 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -957,9 +957,11 @@ class OpenAIServing:
         def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
             tensor = torch.load(io.BytesIO(base64.b64decode(embed)),
                                 weights_only=True)
-            assert isinstance(
-                tensor,
-                (torch.FloatTensor, torch.BFloat16Tensor, torch.HalfTensor))
+            assert isinstance(tensor, torch.Tensor) and tensor.dtype in (
+                torch.float32,
+                torch.bfloat16,
+                torch.float16,
+            )
             if tensor.dim() > 2:
                 tensor = tensor.squeeze(0)
                 assert tensor.dim() == 2

From 7728dd77bb802e1876012eb264df4d2fa2fc6f3c Mon Sep 17 00:00:00 2001
From: QiliangCui <derrhein@gmail.com>
Date: Fri, 25 Jul 2025 23:20:30 -0700
Subject: [PATCH 26/57] [TPU][Test] Divide TPU v1 Test into 2 parts. (#21431)

---
 .../hardware_ci/run-tpu-v1-test-part2.sh      | 166 ++++++++++++++++++
 .../scripts/hardware_ci/run-tpu-v1-test.sh    |  12 --
 2 files changed, 166 insertions(+), 12 deletions(-)
 create mode 100755 .buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh

diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
new file mode 100755
index 0000000000000..d998c1f73b514
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -0,0 +1,166 @@
+#!/bin/bash
+
+set -xu
+
+
+remove_docker_container() { 
+    docker rm -f tpu-test || true; 
+    docker rm -f vllm-tpu || true;
+}
+
+trap remove_docker_container EXIT
+
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# Build the docker image.
+docker build -f docker/Dockerfile.tpu -t vllm-tpu .
+
+# Set up cleanup.
+cleanup_docker() {
+  # Get Docker's root directory
+  docker_root=$(docker info -f '{{.DockerRootDir}}')
+  if [ -z "$docker_root" ]; then
+    echo "Failed to determine Docker root directory."
+    exit 1
+  fi
+  echo "Docker root directory: $docker_root"
+  # Check disk usage of the filesystem where Docker's root directory is located
+  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+  # Define the threshold
+  threshold=70
+  if [ "$disk_usage" -gt "$threshold" ]; then
+    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+    # Remove dangling images (those that are not tagged and not used by any container)
+    docker image prune -f
+    # Remove unused volumes / force the system prune for old images as well.
+    docker volume prune -f && docker system prune --force --filter "until=72h" --all
+    echo "Docker images and volumes cleanup completed."
+  else
+    echo "Disk usage is below $threshold%. No cleanup needed."
+  fi
+}
+cleanup_docker
+
+# For HF_TOKEN.
+source /etc/environment
+
+docker run --privileged --net host --shm-size=16G -it \
+    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
+    vllm-tpu /bin/bash -c '
+set -e # Exit immediately if a command exits with a non-zero status.
+set -u # Treat unset variables as an error.
+
+echo "--- Starting script inside Docker container ---"
+
+# Create results directory
+RESULTS_DIR=$(mktemp -d)
+# If mktemp fails, set -e will cause the script to exit.
+echo "Results will be stored in: $RESULTS_DIR"
+
+# Install dependencies
+echo "--- Installing Python dependencies ---"
+python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
+    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
+    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
+    && python3 -m pip install --progress-bar off hf-transfer
+echo "--- Python dependencies installed ---"
+export VLLM_USE_V1=1
+export VLLM_XLA_CHECK_RECOMPILATION=1
+export VLLM_XLA_CACHE_PATH=
+echo "Using VLLM V1"
+
+echo "--- Hardware Information ---"
+# tpu-info
+echo "--- Starting Tests ---"
+set +e
+overall_script_exit_code=0
+
+# --- Test Definitions ---
+# If a test fails, this function will print logs and will not cause the main script to exit.
+run_test() {
+    local test_num=$1
+    local test_name=$2
+    local test_command=$3
+    local log_file="$RESULTS_DIR/test_${test_num}.log"
+    local actual_exit_code
+
+    echo "--- TEST_$test_num: Running $test_name ---"
+    
+    # Execute the test command.
+    eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
+    actual_exit_code=$?
+
+    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
+    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
+
+    if [ "$actual_exit_code" -ne 0 ]; then
+        echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
+        echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
+        if [ -f "$log_file" ]; then
+            cat "$log_file" >&2
+        else
+            echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
+        fi
+        echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
+        return "$actual_exit_code" # Return the failure code
+    else
+        echo "TEST_$test_num ($test_name) PASSED."
+        return 0 # Return success
+    fi
+}
+
+# Helper function to call run_test and update the overall script exit code
+run_and_track_test() {
+    local test_num_arg="$1"
+    local test_name_arg="$2"
+    local test_command_arg="$3"
+
+    # Run the test
+    run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
+    local test_specific_exit_code=$?
+
+    # If the test failed, set the overall script exit code to 1
+    if [ "$test_specific_exit_code" -ne 0 ]; then
+        # No need for extra echo here, run_test already logged the failure.
+        overall_script_exit_code=1
+    fi
+}
+
+# --- Actual Test Execution ---
+run_and_track_test 1 "test_struct_output_generate.py" \
+    "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
+run_and_track_test 2 "test_moe_pallas.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
+run_and_track_test 3 "test_lora.py" \
+    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
+run_and_track_test 4 "test_tpu_qkv_linear.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
+run_and_track_test 5 "test_spmd_model_weight_loading.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
+run_and_track_test 6 "test_kv_cache_update_kernel.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
+
+# After all tests have been attempted, exit with the overall status.
+if [ "$overall_script_exit_code" -ne 0 ]; then
+    echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
+else
+    echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
+fi
+exit "$overall_script_exit_code"
+' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
+
+# Capture the exit code of the docker run command
+DOCKER_RUN_EXIT_CODE=$?
+
+# The trap will run for cleanup.
+# Exit the main script with the Docker run command's exit code.
+if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
+    echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
+    exit "$DOCKER_RUN_EXIT_CODE"
+else
+    echo "Docker run command completed successfully."
+    exit 0
+fi
+# TODO: This test fails because it uses RANDOM_SEED sampling
+# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index 5514d7770cff8..e565d4b246945 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -150,18 +150,6 @@ run_and_track_test 9 "test_multimodal.py" \
     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
 run_and_track_test 10 "test_pallas.py" \
     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
-run_and_track_test 11 "test_struct_output_generate.py" \
-    "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
-run_and_track_test 12 "test_moe_pallas.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
-run_and_track_test 13 "test_lora.py" \
-    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
-run_and_track_test 14 "test_tpu_qkv_linear.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
-run_and_track_test 15 "test_spmd_model_weight_loading.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
-run_and_track_test 16 "test_kv_cache_update_kernel.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
 
 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then

From 875af38e01217f20827f0b4e1353b91c884b9d53 Mon Sep 17 00:00:00 2001
From: Lyu Han <lvhan_028@163.com>
Date: Sat, 26 Jul 2025 19:14:04 +0800
Subject: [PATCH 27/57] Support Intern-S1 (#21628)

Signed-off-by: Roger Wang <hey@rogerw.me>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Your Name <you@example.com>
Co-authored-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 docs/models/supported_models.md               |   1 +
 examples/offline_inference/vision_language.py |  32 +
 .../vision_language_multi_image.py            |  28 +
 tests/models/registry.py                      |   2 +
 vllm/model_executor/models/interns1.py        | 711 ++++++++++++++++++
 vllm/model_executor/models/interns1_vit.py    | 421 +++++++++++
 vllm/model_executor/models/registry.py        |   1 +
 7 files changed, 1196 insertions(+)
 create mode 100644 vllm/model_executor/models/interns1.py
 create mode 100644 vllm/model_executor/models/interns1_vit.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 0f3b730eabedc..3847fc15119fd 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -593,6 +593,7 @@ Specified using `--task generate`.
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
 | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
+| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> | `internlm/Intern-S1`, etc. | | ✅︎ | ✅︎ |
 | `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
 | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index eb6b410848558..61f5525c6d7e7 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -468,6 +468,37 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# Intern-S1
+def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "internlm/Intern-S1"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=2,
+        limit_mm_per_prompt={modality: 1},
+        enforce_eager=True,
+    )
+
+    placeholder = "<IMG_CONTEXT>"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    messages = [
+        [{"role": "user", "content": f"{placeholder}\n{question}"}]
+        for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # InternVL
 def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "OpenGVLab/InternVL3-2B"
@@ -1303,6 +1334,7 @@ model_example_map = {
     "h2ovl_chat": run_h2ovl,
     "hyperclovax_seed_vision": run_hyperclovax_seed_vision,
     "idefics3": run_idefics3,
+    "interns1": run_interns1,
     "internvl_chat": run_internvl,
     "nemotron_vl": run_nemotron_vl,
     "keye_vl": run_keye_vl,
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 2e14fc807e104..e312a0953e9be 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -253,6 +253,33 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "internlm/Intern-S1"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = "\n".join(
+        f"Image-{i}: <IMG_CONTEXT>\n" for i, _ in enumerate(image_urls, start=1)
+    )
+    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "OpenGVLab/InternVL2-2B"
 
@@ -946,6 +973,7 @@ model_example_map = {
     "gemma3": load_gemma3,
     "h2ovl_chat": load_h2ovl,
     "idefics3": load_idefics3,
+    "interns1": load_interns1,
     "internvl_chat": load_internvl,
     "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
     "keye_vl": load_keye_vl,
diff --git a/tests/models/registry.py b/tests/models/registry.py
index b41e432d738a7..0dc5aec8db12e 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -381,6 +381,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                          extras={"2B": "OpenGVLab/InternVL2-2B",
                                                  "3.0": "OpenGVLab/InternVL3-1B"},  # noqa: E501
                                          trust_remote_code=True),
+    "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1",
+                                         trust_remote_code=True),
     "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3",  # noqa: E501
                                                         {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}),  # noqa: E501
     "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py
new file mode 100644
index 0000000000000..36204e4c5953f
--- /dev/null
+++ b/vllm/model_executor/models/interns1.py
@@ -0,0 +1,711 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# --------------------------------------------------------
+# InternS1
+# Copyright (c) 2025 Shanghai AI Lab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from collections.abc import Iterable, Mapping, Sequence
+from typing import Literal, Optional, TypedDict, Union
+
+import torch
+import torch.nn as nn
+from transformers import InternVLProcessor, PretrainedConfig
+from transformers.activations import ACT2FN
+from transformers.models.got_ocr2.image_processing_got_ocr2_fast import (
+    GotOcr2ImageProcessorFast)
+
+from vllm.config import VllmConfig
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.models.interns1_vit import InternS1VisionModel
+from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs, NestedTensors)
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+                                   ImageSize, MultiModalDataItems)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
+                         SupportsMultiModal, SupportsPP)
+from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
+                    init_vllm_registered_model, maybe_prefix,
+                    merge_multimodal_embeddings)
+
+
+class InternS1MultiModalProjector(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size *
+                                       int(1 / config.downsample_ratio)**2)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size *
+            int(1 / config.downsample_ratio)**2,
+            config.text_config.hidden_size)
+        self.act = ACT2FN[config.projector_hidden_act]
+        self.linear_2 = nn.Linear(config.text_config.hidden_size,
+                                  config.text_config.hidden_size)
+
+    def forward(self, image_features):
+        hidden_states = self.layer_norm(image_features)
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+class InternS1ImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    pixel_values: torch.Tensor
+    """
+    Shape:
+    `(batch_size * num_images * (1 + num_patches), num_channels, height, width)`
+    """
+
+
+class InternS1ImageEmbeddingInputs(TypedDict):
+    type: Literal["image_embeds"]
+    data: Union[torch.Tensor, list[torch.Tensor]]
+    """
+    A tensor of shape `(num_images, total_image_feature_size, hidden_size)`
+    or a list of tensors of shape `(total_image_feature_size, hidden_size)`
+
+    `hidden_size` must match the hidden size of language model backbone.
+    """
+
+
+InternS1ImageInputs = Union[InternS1ImagePixelInputs,
+                            InternS1ImageEmbeddingInputs]
+
+
+class InternS1VideoPixelInputs(TypedDict):
+    type: Literal["pixel_values_videos"]
+    pixel_values: torch.Tensor
+    """
+    Shape:
+    `(batch_size * num_video * num_frames, num_channels, height, width)`
+    """
+
+    num_patches: torch.Tensor
+    """Shape: `(batch_size * num_images)`"""
+
+
+class InternS1VideoEmbeddingInputs(TypedDict):
+    type: Literal["video_embeds"]
+    data: Union[torch.Tensor, list[torch.Tensor]]
+    """
+    A tensor of shape `(num_videos, total_video_feature_size, hidden_size)`
+    or a list of tensors of shape `(total_video_feature_size, hidden_size)`
+
+    `hidden_size` must match the hidden size of language model backbone.
+    """
+
+
+InternS1VideoInputs = Union[InternS1VideoPixelInputs,
+                            InternS1VideoEmbeddingInputs]
+
+
+def resolve_interns1_min_max_num(
+    min_dynamic_patch: int,
+    max_dynamic_patch: int,
+    dynamic_image_size: bool,
+    use_thumbnail: bool,
+) -> tuple[int, int]:
+    min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
+    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
+
+    if use_thumbnail and max_dynamic_patch != 1:
+        max_dynamic_patch += 1
+
+    return min_dynamic_patch, max_dynamic_patch
+
+
+def get_interns1_target_ratios(
+    min_num: int,
+    max_num: int,
+) -> list[tuple[int, int]]:
+    target_ratios = {(i, j)
+                     for n in range(min_num, max_num + 1)
+                     for i in range(1, n + 1)
+                     for j in range(1, n + 1) if min_num <= i * j <= max_num}
+    return sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+
+class InternS1ProcessingInfo(BaseProcessingInfo):
+    """Basic image-only ProcessingInfo for InternS1-style models."""
+
+    def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
+        return self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        processor: Optional['GotOcr2ImageProcessorFast'] = None,
+    ) -> int:
+        if processor is None:
+            processor = self.get_hf_processor().image_processor
+
+        if not isinstance(processor, GotOcr2ImageProcessorFast):
+            raise ValueError(f'GotOcr2ImageProcessorFast is expected but got '
+                             f'{type(processor)}')
+        num_image_patches = processor.get_number_of_image_tokens(
+            image_height, image_width, images_kwargs=dict())
+        num_image_tokens = self.get_hf_processor(
+        ).image_seq_length * num_image_patches
+        return num_image_tokens
+
+    def resolve_target_ratios(self, use_thumbnail: Optional[bool] = None):
+        image_processor = self.get_hf_processor().image_processor
+        min_dynamic_patch = image_processor.min_patches
+        max_dynamic_patch = image_processor.max_patches
+        # HF format's InternVL processor uses `crop_to_patches` which is
+        # equivalent to `use_thumbnail` in original format.
+        use_thumbnail = image_processor.crop_to_patches
+        dynamic_image_size = True
+        min_num, max_num = resolve_interns1_min_max_num(
+            min_dynamic_patch,
+            max_dynamic_patch,
+            dynamic_image_size,
+            use_thumbnail=use_thumbnail)
+
+        return get_interns1_target_ratios(min_num, max_num)
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        processor = self.get_hf_processor()
+
+        hf_config = self.ctx.get_hf_config()
+        base_height, base_width = hf_config.vision_config.image_size
+        target_ratios = self.resolve_target_ratios()
+
+        largest_feature_size, largest_feature_pinpoint = 0, None
+        for wr, hr in target_ratios:
+            width, height = base_width * wr, base_height * hr
+
+            feat_size = self.get_num_image_tokens(
+                image_width=width,
+                image_height=height,
+                processor=processor.image_processor,
+            )
+            if feat_size > largest_feature_size:
+                largest_feature_size = feat_size
+                largest_feature_pinpoint = ImageSize(width=width,
+                                                     height=height)
+
+        assert not (largest_feature_size == 0 or largest_feature_pinpoint
+                    is None), ("Cannot have a largest feature size of 0!")
+
+        return largest_feature_pinpoint
+
+    def get_max_image_tokens(self) -> int:
+        processor = self.get_hf_processor()
+        target_width, target_height = self.get_image_size_with_most_features()
+
+        return self.get_num_image_tokens(
+            image_width=target_width,
+            image_height=target_height,
+            processor=processor.image_processor,
+        )
+
+
+class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo]
+                                 ):
+    """Basic image-only DummyInputsBuilder for InternS1-style models."""
+
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+        image_token = self.info.get_hf_processor().image_token
+
+        return image_token * num_images
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalDataDict:
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
+        num_images = mm_counts.get("image", 0)
+
+        return {
+            "image":
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
+                                   num_images=num_images)
+        }
+
+
+class InternS1MultiModalProcessor(
+        BaseMultiModalProcessor[InternS1ProcessingInfo]):
+    """ Basic image-only MultiModalProcessor for InternS1-style models."""
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> Mapping[str, NestedTensors]:
+        processed_outputs = super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+            tok_kwargs=tok_kwargs,
+        )
+
+        hf_processor = self.info.get_hf_processor(**mm_kwargs)
+        image_token_id = hf_processor.image_token_id
+
+        # Since there may be extra tokens in the feature placeholders,
+        # we need to pass the image token ID to the model to select the
+        # tokens to merge from the vision encoder outputs
+        processed_outputs["image_token_id"] = torch.tensor(image_token_id)
+        images = mm_data.get('images', None)
+        image_processor = self.info.get_hf_processor().image_processor
+        if images is not None:
+            image_inputs = image_processor(images=images)
+            image_num_patches = image_inputs.pop("num_patches")
+            if not isinstance(image_num_patches, list):
+                raise ValueError(
+                    f'num_patches is supposed to be list, but got '
+                    f'{type(image_num_patches)}')
+            image_num_patches = torch.tensor(image_num_patches)
+            processed_outputs['image_num_patches'] = image_num_patches
+
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: Mapping[str, NestedTensors],
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+
+        image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0))
+        num_images = len(image_num_patches)
+
+        return dict(
+            pixel_values=MultiModalFieldConfig.flat_from_sizes(
+                "image", image_num_patches),
+            image_num_patches=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+            image_token_id=MultiModalFieldConfig.shared("image", num_images),
+        )
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> Sequence[PromptUpdate]:
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        img_context_token = hf_processor.image_token
+        start_image_token = hf_processor.start_image_token
+        end_image_token = hf_processor.end_image_token
+
+        def get_replacement(item_idx: int):
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems))
+
+            if isinstance(images, ImageEmbeddingItems):
+                feature_size = images.get_feature_size(item_idx)
+            else:
+                image_size = images.get_image_size(item_idx)
+                feature_size = self.info.get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                    processor=hf_processor.image_processor,
+                )
+
+            repl_features = img_context_token * feature_size
+            repl_full = start_image_token + repl_features + end_image_token
+            return PromptUpdateDetails.select_text(repl_full,
+                                                   img_context_token)
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=img_context_token,
+                replacement=get_replacement,
+            )
+        ]
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    InternS1MultiModalProcessor,
+    info=InternS1ProcessingInfo,
+    dummy_inputs=InternS1DummyInputsBuilder)
+class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
+                                       SupportsPP, SupportsLoRA):
+
+    # To ensure correct weight loading and mapping.
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "lm_head.": "language_model.lm_head.",
+            "model.language_model.": "language_model.model.",
+            "model.vision_tower.": "vision_tower.",
+            "model.multi_modal_projector.": "multi_modal_projector.",
+        })
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+        # transformers InternVLProcessor uses <IMG_CONTEXT> as the seperator
+        # refer to https://github.com/huggingface/transformers/blob/f90de364c2484c7c325bbe05befdcf487bd75b63/src/transformers/models/internvl/processing_internvl.py#L116
+        if modality.startswith("image"):
+            return '<IMG_CONTEXT>'
+        if modality.startswith("video"):
+            return "<video>"
+
+        raise ValueError("Only image or video modality is supported")
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        image_size = config.vision_config.image_size[0]
+        patch_size = config.vision_config.patch_size[0]
+        self.patch_size = patch_size
+        self.num_image_token = int(
+            (image_size // patch_size)**2 * (config.downsample_ratio**2))
+        self.downsample_ratio = config.downsample_ratio
+
+        self.llm_arch_name = config.text_config.architectures[0]
+        self.vision_tower = self._init_vision_model(
+            config,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "vision_tower"),
+        )
+
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            hf_config=config.text_config,
+            prefix=maybe_prefix(prefix, "language_model"),
+        )
+
+        self.multi_modal_projector = self._init_mlp1(config)
+
+        self.img_context_token_id = None
+        self.video_context_token_id = None
+
+        self.visual_token_mask = None
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors)
+
+    def _init_vision_model(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig],
+        *,
+        prefix: str,
+    ):
+        num_hidden_layers = config.vision_config.num_hidden_layers
+        return InternS1VisionModel(
+            config.vision_config,
+            quant_config=quant_config,
+            num_hidden_layers_override=num_hidden_layers,
+            prefix=prefix,
+        )
+
+    def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential:
+        return InternS1MultiModalProjector(config)
+
+    def pixel_shuffle(self, x, scale_factor=0.5):
+        n, w, h, c = x.size()
+        # N, W, H, C --> N, W, H * scale, C // scale
+        x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
+        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
+        x = x.permute(0, 2, 1, 3).contiguous()
+        x = x.view(n, int(h * scale_factor), int(w * scale_factor),
+                   int(c / (scale_factor * scale_factor)))
+        x = x.permute(0, 2, 1, 3).contiguous()
+        return x
+
+    def extract_feature(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        vit_embeds = self.vision_tower(pixel_values=pixel_values)
+        vit_embeds = vit_embeds[:, 1:, :]
+
+        h = w = int(vit_embeds.shape[1]**0.5)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+        vit_embeds = self.pixel_shuffle(vit_embeds,
+                                        scale_factor=self.downsample_ratio)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1,
+                                        vit_embeds.shape[-1])
+
+        vit_embeds = self.multi_modal_projector(vit_embeds)
+        return vit_embeds
+
+    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
+
+        h, w = self.config.vision_config.image_size
+        expected_dims = (3, h, w)
+
+        def _validate_shape(d: torch.Tensor):
+            actual_dims = tuple(d.shape)
+
+            if actual_dims != expected_dims:
+                expected_expr = str(expected_dims)
+                raise ValueError(
+                    "The expected shape of pixel values per image per batch "
+                    f" per patch is {expected_expr}. "
+                    f"You supplied {tuple(d.shape)}.")
+
+        for d in data:
+            _validate_shape(d)
+
+        return data
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[InternS1ImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_num_patches = kwargs.pop("image_num_patches", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+
+        if pixel_values is None and image_embeds is None:
+            return None
+
+        if image_embeds is not None:
+            if not isinstance(image_embeds, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of image embeddings. "
+                                 f"Got type: {type(image_embeds)}")
+
+            return InternS1ImageEmbeddingInputs(
+                type="image_embeds",
+                data=flatten_bn(image_embeds),
+            )
+
+        image_token_id = kwargs["image_token_id"]
+        assert isinstance(image_token_id, torch.Tensor)
+        self.img_context_token_id = image_token_id.flatten().unique().item()
+
+        if pixel_values is not None:
+            if not isinstance(pixel_values, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of pixel values. "
+                                 f"Got type: {type(pixel_values)}")
+
+            if not isinstance(image_num_patches, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of image_num_patches. "
+                                 f"Got type: {type(image_num_patches)}")
+
+            pixel_values = flatten_bn(pixel_values, concat=True)
+            image_num_patches = flatten_bn(image_num_patches, concat=True)
+
+            return InternS1ImagePixelInputs(
+                type="pixel_values",
+                pixel_values=self._validate_pixel_values(pixel_values),
+                num_patches=image_num_patches,
+            )
+
+        raise AssertionError("This line should be unreachable.")
+
+    def _parse_and_validate_video_input(
+            self, **kwargs: object) -> Optional[InternS1VideoPixelInputs]:
+        pixel_values_flat_video = kwargs.pop("pixel_values_flat_video", None)
+        video_num_patches = kwargs.pop("video_num_patches", None)
+        video_embeds = kwargs.pop("video_embeds", None)
+
+        if pixel_values_flat_video is None and video_embeds is None:
+            return None
+
+        if video_embeds is not None:
+            if not isinstance(video_embeds, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of video embeddings. "
+                                 f"Got type: {type(video_embeds)}")
+
+            return InternS1ImageEmbeddingInputs(
+                type="video_embeds",
+                data=flatten_bn(video_embeds),
+            )
+
+        video_token_id = kwargs["video_token_id"]
+        assert isinstance(video_token_id, torch.Tensor)
+        self.video_context_token_id = video_token_id.flatten().unique().item()
+
+        if pixel_values_flat_video is not None:
+            if not isinstance(pixel_values_flat_video, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of pixel values. "
+                                 f"Got type: {type(pixel_values_flat_video)}")
+
+            if not isinstance(video_num_patches, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of image_num_patches. "
+                                 f"Got type: {type(video_num_patches)}")
+
+            pixel_values_flat_video = flatten_bn(pixel_values_flat_video,
+                                                 concat=True)
+            video_num_patches = flatten_bn(video_num_patches, concat=True)
+
+            return InternS1VideoPixelInputs(
+                type="pixel_values_videos",
+                pixel_values=self._validate_pixel_values(
+                    pixel_values_flat_video),
+                num_patches=video_num_patches,
+            )
+
+        raise AssertionError("This line should be unreachable.")
+
+    def _process_image_input(
+        self,
+        image_input: Union[InternS1ImageInputs, InternS1VideoPixelInputs],
+    ) -> tuple[torch.Tensor, ...]:
+        if image_input["type"] == "image_embeds":
+            return image_input["data"]
+
+        assert self.vision_tower is not None
+
+        image_embeds = self.extract_feature(image_input["pixel_values"])
+
+        num_patches = image_input["num_patches"]
+
+        # Only one image in the current batch
+        if len(num_patches) == 1:
+            return (image_embeds.view(-1,
+                                      self.config.text_config.hidden_size), )
+
+        # NOTE: Image embeddings are split into separate tensors for each image
+        # by the size of each embedding.
+        feature_size = image_embeds.shape[1]
+        image_embeds = image_embeds.view(-1,
+                                         self.config.text_config.hidden_size)
+        image_feature_sizes = [
+            num_patches * feature_size for num_patches in num_patches
+        ]
+        return image_embeds.split(image_feature_sizes)
+
+    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
+        modalities = {}
+
+        # Preserve the order of modalities if there are multiple of them
+        # from the order of kwargs.
+        for input_key in kwargs:
+            if input_key in ("pixel_values",
+                             "image_embeds") and "images" not in modalities:
+                modalities["images"] = self._parse_and_validate_image_input(
+                    **kwargs)
+            if input_key in ("pixel_values_flat_video",
+                             ) and "videos" not in modalities:
+                modalities["videos"] = self._parse_and_validate_video_input(
+                    **kwargs)
+
+        return modalities
+
+    def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None:
+        self.visual_token_mask = None
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def get_multimodal_embeddings(self,
+                                  **kwargs: object) -> MultiModalEmbeddings:
+
+        modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
+        if not modalities:
+            return []
+            return None
+
+        # The result multimodal_embeddings is tuple of tensors, with each
+        # tensor correspoending to a multimodal data item (image or video).
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+
+        # NOTE: It is important to iterate over the keys in this dictionary
+        # to preserve the order of the modalities.
+        for modality in modalities:
+            if modality == "images":
+                image_input = modalities["images"]
+                vision_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += vision_embeddings
+            if modality == "videos":
+                video_input = modalities["videos"]
+                video_embeddings = self._process_image_input(video_input)
+                multimodal_embeddings += video_embeddings
+
+        return multimodal_embeddings
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+        if multimodal_embeddings is not None \
+            and len(multimodal_embeddings) != 0:
+            context_token_ids = [
+                token_id for token_id in (self.img_context_token_id,
+                                          self.video_context_token_id)
+                if token_id is not None
+            ]
+            assert len(context_token_ids) >= 1
+            self._set_visual_token_mask(input_ids)
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                multimodal_embeddings,
+                context_token_ids,
+            )
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ) -> IntermediateTensors:
+
+        if intermediate_tensors is not None:
+            input_ids = None
+            inputs_embeds = None
+
+        # NOTE: In v1, inputs_embeds is always generated at model runner, this
+        # condition is for v0 compatibility.
+        elif inputs_embeds is None:
+            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
+            inputs_embeds = self.get_input_embeddings(input_ids,
+                                                      vision_embeddings)
+            input_ids = None
+
+        forward_kwargs = {
+            "input_ids": input_ids,
+            "positions": positions,
+            "intermediate_tensors": intermediate_tensors,
+            "inputs_embeds": inputs_embeds,
+        }
+
+        hidden_states = self.language_model.model(**forward_kwargs)
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        return self.language_model.compute_logits(hidden_states,
+                                                  sampling_metadata)
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="multi_modal_projector",
+            tower_model="vision_tower")
diff --git a/vllm/model_executor/models/interns1_vit.py b/vllm/model_executor/models/interns1_vit.py
new file mode 100644
index 0000000000000..300ed17ecaabc
--- /dev/null
+++ b/vllm/model_executor/models/interns1_vit.py
@@ -0,0 +1,421 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
+# --------------------------------------------------------
+# InternVL
+# Copyright (c) 2023 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from collections.abc import Iterable
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PretrainedConfig
+from transformers.utils import torch_int
+
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+NORM2FN = {
+    'rms_norm': RMSNorm,
+    'layer_norm': nn.LayerNorm,
+}
+
+
+class InternS1VisionPatchEmbeddings(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.hidden_size
+
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] //
+                                                          patch_size[0])
+        patch_shape = (image_size[0] // patch_size[0],
+                       image_size[1] // patch_size[1])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+        self.patch_shape = patch_shape
+
+        self.projection = nn.Conv2d(num_channels,
+                                    hidden_size,
+                                    kernel_size=patch_size,
+                                    stride=patch_size)
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        batch_size, num_channels, height, width = pixel_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values "
+                "match with the one set in the configuration.")
+
+        embeddings = self.projection(
+            pixel_values.to(self.projection.weight.dtype))
+        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
+        embeddings = embeddings.flatten(2).transpose(1, 2)
+
+        return embeddings, (patch_height, patch_width)
+
+
+class InternS1VisionEmbeddings(nn.Module):
+
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.config = config
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+        if config.use_mask_token:
+            self.mask_token = nn.Parameter(
+                torch.zeros(1, 1, config.hidden_size))
+        else:
+            self.mask_token = None
+        self.patch_embeddings = InternS1VisionPatchEmbeddings(config)
+        self.patch_size = config.patch_size
+        self.image_size = (config.image_size if isinstance(
+            config.image_size, Iterable) else
+                           (config.image_size, config.image_size))
+        num_patches = self.patch_embeddings.num_patches
+        if config.use_absolute_position_embeddings:
+            self.position_embeddings = nn.Parameter(
+                torch.zeros(1, num_patches + 1, config.hidden_size))
+        else:
+            self.position_embeddings = None
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int,
+                                 width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """  # noqa: E501
+
+        num_patches = embeddings.shape[1] - 1
+        num_positions = self.position_embeddings.shape[1] - 1
+
+        # always interpolate when tracing to ensure the exported model
+        # works for dynamic input shapes
+        if not torch.jit.is_tracing(
+        ) and num_patches == num_positions and height == width:
+            return self.position_embeddings
+
+        class_pos_embed = self.position_embeddings[:, :1]
+        patch_pos_embed = self.position_embeddings[:, 1:]
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size[0]
+        new_width = width // self.patch_size[1]
+
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions,
+                                                  sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+    ) -> torch.Tensor:
+        _, _, height, width = pixel_values.shape
+        embeddings, (patch_height,
+                     patch_width) = self.patch_embeddings(pixel_values)
+        batch_size, seq_len, _ = embeddings.size()
+
+        if bool_masked_pos is not None:
+            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
+            # replace the masked visual tokens by mask_tokens
+            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
+            embeddings = embeddings * (1 - w) + mask_tokens * w
+
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+        if self.position_embeddings is not None:
+            embeddings = embeddings + self.interpolate_pos_encoding(
+                embeddings, height, width)
+
+        return embeddings, (patch_height, patch_width)
+
+
+class InternSdpaAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        *,
+        num_dummy_heads: int = 0,
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f'embed_dim must be divisible by num_heads '
+                f'(got `embed_dim`: {self.embed_dim} and `num_heads`:'
+                f' {self.num_heads}).')
+
+        # Additional dummy heads are used to enable TP for common GPU counts.
+        self.dummy_dim = (num_dummy_heads + self.num_heads) * self.head_dim
+
+        self.scale = self.head_dim**-0.5
+
+        self.q_proj = nn.Linear(self.embed_dim,
+                                self.num_heads * self.head_dim,
+                                bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.embed_dim,
+                                self.num_heads * self.head_dim,
+                                bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.embed_dim,
+                                self.num_heads * self.head_dim,
+                                bias=config.attention_bias)
+
+        self.qk_normalization = config.use_qk_norm
+        if self.qk_normalization:
+            self.q_norm = RMSNorm(self.dummy_dim,
+                                  eps=config.layer_norm_eps,
+                                  var_hidden_size=self.embed_dim)
+            self.k_norm = RMSNorm(self.dummy_dim,
+                                  eps=config.layer_norm_eps,
+                                  var_hidden_size=self.embed_dim)
+
+        self.projection_layer = nn.Linear(self.dummy_dim, self.embed_dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        B, N, C = x.shape
+
+        q = self.q_proj(x)
+        k = self.k_proj(x)
+        v = self.v_proj(x)
+
+        q = q.view(B, N, self.num_heads, self.head_dim)
+        k = k.view(B, N, self.num_heads, self.head_dim)
+        v = v.view(B, N, self.num_heads, self.head_dim)
+
+        if self.qk_normalization:
+            B_, N_, H_, D_ = q.shape
+            q = self.q_norm(q.flatten(-2, -1)).view(B_, N_, H_, D_)
+            k = self.k_norm(k.flatten(-2, -1)).view(B_, N_, H_, D_)
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+        v = v.transpose(1, 2)
+
+        x = F.scaled_dot_product_attention(q, k, v, scale=self.scale)
+        x = x.transpose(1, 2).reshape(B, N, -1)
+
+        x = self.projection_layer(x)
+        return x
+
+
+class InternS1VisionMLP(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.activation_fn = get_act_fn(config.hidden_act)
+        self.fc1 = ColumnParallelLinear(config.hidden_size,
+                                        config.intermediate_size,
+                                        bias=True,
+                                        quant_config=quant_config,
+                                        prefix=f"{prefix}.fc1")
+        self.fc2 = RowParallelLinear(config.intermediate_size,
+                                     config.hidden_size,
+                                     bias=True,
+                                     quant_config=quant_config,
+                                     prefix=f"{prefix}.fc2")
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+
+        return hidden_states
+
+
+class InternS1VisionLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        *,
+        num_dummy_heads: int = 0,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.attention = self._init_attn(config,
+                                         quant_config,
+                                         num_dummy_heads=num_dummy_heads,
+                                         prefix=f"{prefix}.attention")
+
+        self.mlp = InternS1VisionMLP(config,
+                                     quant_config=quant_config,
+                                     prefix=f"{prefix}.mlp")
+        self.layernorm_before = NORM2FN[config.norm_type](
+            config.hidden_size, eps=config.layer_norm_eps)
+        self.layernorm_after = NORM2FN[config.norm_type](
+            config.hidden_size, eps=config.layer_norm_eps)
+
+        init_values = config.layer_scale_init_value
+        self.lambda_1 = nn.Parameter(init_values *
+                                     torch.ones(config.hidden_size),
+                                     requires_grad=True)
+        self.lambda_2 = nn.Parameter(init_values *
+                                     torch.ones(config.hidden_size),
+                                     requires_grad=True)
+
+    def _init_attn(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig],
+        *,
+        num_dummy_heads: int,
+        prefix: str = "",
+    ):
+        return InternSdpaAttention(config, num_dummy_heads=num_dummy_heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+    ):
+        hidden_states = hidden_states + self.attention(
+            self.layernorm_before(hidden_states)) * self.lambda_1
+
+        hidden_states = hidden_states + self.mlp(
+            self.layernorm_after(hidden_states)) * self.lambda_2
+
+        return hidden_states
+
+
+class InternS1VisionEncoder(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        *,
+        num_hidden_layers_override: Optional[int] = None,
+        num_dummy_heads: int = 0,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+
+        if num_hidden_layers_override is None:
+            num_hidden_layers = config.num_hidden_layers
+        else:
+            num_hidden_layers = num_hidden_layers_override
+
+        self.layer = nn.ModuleList([
+            InternS1VisionLayer(config,
+                                quant_config,
+                                num_dummy_heads=num_dummy_heads,
+                                prefix=f"{prefix}.layer.{layer_idx}")
+            for layer_idx in range(num_hidden_layers)
+        ])
+
+    def forward(self, inputs_embeds: torch.Tensor):
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layer:
+            hidden_states = encoder_layer(hidden_states)
+
+        return hidden_states
+
+
+class InternS1VisionModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        *,
+        num_hidden_layers_override: Optional[int] = None,
+        num_dummy_heads: int = 0,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        self.embeddings = InternS1VisionEmbeddings(config)
+        self.encoder = InternS1VisionEncoder(
+            config=config,
+            num_hidden_layers_override=num_hidden_layers_override,
+            num_dummy_heads=num_dummy_heads,
+            prefix=f"{prefix}.encoder",
+        )
+        self.layernorm = (nn.Identity() if config.use_mean_pooling else
+                          nn.LayerNorm(config.hidden_size,
+                                       eps=config.layer_norm_eps))
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        pixel_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.FloatTensor:
+        if pixel_values is None and pixel_embeds is None:
+            raise ValueError(
+                'You have to specify pixel_values or pixel_embeds')
+
+        if pixel_embeds is not None:
+            hidden_states = pixel_embeds
+        elif pixel_values is not None:
+            if pixel_values.ndim == 4:
+                hidden_states, _ = self.embeddings(pixel_values)
+            else:
+                raise ValueError(
+                    f'wrong pixel_values size: {pixel_values.shape}')
+
+        encoder_outputs = self.encoder(inputs_embeds=hidden_states)
+        encoder_outputs = self.layernorm(encoder_outputs)
+
+        return encoder_outputs
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 9b204fdcbe1a5..709b2eb37a3ed 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -203,6 +203,7 @@ _MULTIMODAL_MODELS = {
     "GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"),  # noqa: E501
     "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
     "InternVLChatModel": ("internvl", "InternVLChatModel"),
+    "InternS1ForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"),  # noqa: E501
     "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"),
     "SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"),  # noqa: E501
     "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"),

From 05c1126f292ccaf6c57f46cc2c4f5df37db21543 Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Sat, 26 Jul 2025 20:20:03 +0800
Subject: [PATCH 28/57] [Misc] remove unused try-except in pooling config check
 (#21618)

Signed-off-by: reidliu41 <reid201711@gmail.com>
---
 vllm/transformers_utils/config.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index da475c3b50a39..04ff08825bbc5 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -574,13 +574,11 @@ def get_pooling_config_name(pooling_name: str) -> Union[str, None]:
     supported_pooling_types = ['LAST', 'ALL', 'CLS', 'STEP', 'MEAN']
     pooling_type_name = pooling_name.upper()
 
-    try:
-        if pooling_type_name in supported_pooling_types:
-            return pooling_type_name
-    except NotImplementedError as e:
-        logger.debug("Pooling type not supported", e)
-        return None
-    return None
+    if pooling_type_name in supported_pooling_types:
+        return pooling_type_name
+
+    raise NotImplementedError(
+        f"Pooling type {pooling_type_name} not supported")
 
 
 @cache

From e98def439cc89adba6d704083c5c78dd89b51b5f Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Sat, 26 Jul 2025 06:06:05 -0700
Subject: [PATCH 29/57] [Take 2] Correctly kill vLLM processes after benchmarks
 (#21646)

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 .../nightly-benchmarks/scripts/run-performance-benchmarks.sh   | 3 ++-
 benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh      | 2 ++
 benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh   | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index f05040618981c..630943c80c4a7 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -126,7 +126,8 @@ kill_gpu_processes() {
   ps -aux
   lsof -t -i:8000 | xargs -r kill -9
   pgrep python3 | xargs -r kill -9
-
+  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
+  pgrep VLLM | xargs -r kill -9
 
   # wait until GPU memory usage smaller than 1GB
   if command -v nvidia-smi; then
diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
index 94999630bae12..b150b01949658 100644
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -12,6 +12,8 @@ kill_gpu_processes() {
   # kill all processes on GPU.
   pgrep pt_main_thread | xargs -r kill -9
   pgrep python3 | xargs -r kill -9
+  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
+  pgrep VLLM | xargs -r kill -9
   sleep 10
 
   # remove vllm config file
diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
index eb5d891d0d4a5..c5a483f2ff22b 100644
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -18,6 +18,8 @@ kill_gpu_processes() {
   # kill all processes on GPU.
   pgrep pt_main_thread | xargs -r kill -9
   pgrep python3 | xargs -r kill -9
+  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
+  pgrep VLLM | xargs -r kill -9
   for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
   sleep 1
 }

From 9d197280fa60e5f313b15ecca723ef9bec9b5399 Mon Sep 17 00:00:00 2001
From: Benji Beck <benjibeck@meta.com>
Date: Sat, 26 Jul 2025 06:08:15 -0700
Subject: [PATCH 30/57] Migrate AriaImagePixelInputs to TensorSchema for shape
 validation (#21620)

Signed-off-by: Benji Beck <benjibeck@meta.com>
---
 vllm/model_executor/models/aria.py | 50 +++++++++++++-----------------
 1 file changed, 21 insertions(+), 29 deletions(-)

diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 8ae1680a71f3c..e1368a3f6478a 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Optional, TypedDict, Union
+from typing import Annotated, Optional, Union
 
 import torch
 import torch.nn as nn
@@ -29,6 +29,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         PromptUpdate)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 # yapf: disable
 from .idefics2_vision_model import Idefics2VisionConfig
@@ -42,15 +43,26 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                     merge_multimodal_embeddings)
 
 
-class AriaImagePixelInputs(TypedDict):
-    pixel_values: torch.Tensor
-    pixel_mask: Optional[torch.Tensor]
+class AriaImagePixelInputs(TensorSchema):
     """
-    Shape:
-        pixel_values: `(batch_size * num_images, num_channels, height, width)`
-        pixel_mask: `(batch_size * num_images, height, width)`
+    Dimensions:
+        - b: Batch size
+        - n: Number of images
+        - c: Number of channels
+        - h: Height of each image
+        - w: Width of each image
     """
 
+    pixel_values: Annotated[
+        torch.Tensor,
+        TensorShape("bn", 3, "h", "w"),
+    ]
+
+    pixel_mask: Annotated[
+        Optional[torch.Tensor],
+        TensorShape("bn", "h", "w"),
+    ]
+
 
 class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
     packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
@@ -540,12 +552,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 self.vocab_size, logit_scale)
 
-    def _validate_image_sizes(
-            self, images: list[torch.Tensor]) -> list[torch.Tensor]:
-        if not all(img.shape == images[0].shape for img in images):
-            raise ValueError("All images must be the same size")
-        return images
-
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[AriaImagePixelInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
@@ -554,23 +560,9 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
         if pixel_values is None:
             return None
 
-        if not isinstance(pixel_values, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of pixel values. "
-                             f"Got type: {type(pixel_values)}")
-
-        pixel_values = self._validate_image_sizes(pixel_values)
-        pixel_values = flatten_bn(pixel_values, concat=True)
-
-        if pixel_mask is not None:
-            if not isinstance(pixel_mask, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel mask. "
-                                 f"Got type: {type(pixel_mask)}")
-
-            pixel_mask = flatten_bn(pixel_mask, concat=True)
-
         return AriaImagePixelInputs(
-            pixel_values=pixel_values,
-            pixel_mask=pixel_mask,
+            pixel_values=flatten_bn(pixel_values, concat=True),
+            pixel_mask=flatten_bn(pixel_mask, concat=True),
         )
 
     def _create_patch_attention_mask(

From de10ff0b7cc757af3d0374d82c1a2130196af496 Mon Sep 17 00:00:00 2001
From: Benji Beck <benjibeck@meta.com>
Date: Sat, 26 Jul 2025 06:08:18 -0700
Subject: [PATCH 31/57] Migrate AyaVisionImagePixelInputs to TensorSchema for
 shape validation (#21622)

Signed-off-by: Benji Beck <benjibeck@meta.com>
---
 vllm/model_executor/models/aya_vision.py | 67 ++++++++++--------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py
index 45dd660c89375..a3eee9f065aea 100644
--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from https://github.com/huggingface/transformers/tree/main/src/transformers/models/aya_vision
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Literal, Optional, TypedDict, Union, cast
+from typing import Annotated, Literal, Optional, Union, cast
 
 import torch
 from torch import nn
@@ -29,6 +29,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .siglip import SiglipVisionModel
@@ -37,18 +38,28 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                     merge_multimodal_embeddings)
 
 
-class AyaVisionImagePixelInputs(TypedDict):
+class AyaVisionImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - np: The total number of patches over each image over each prompt in
+              the batch
+        - c: Number of channels
+        - h: Height of each image patch
+        - w: Width of each image patch
+        - bn: Batch size * number of images
+    """
+
     type: Literal["pixel_values"]
-    pixel_values: torch.Tensor
-    """
-    Shape: `(num_patches_total, num_channels, height, width)`
 
-    `num_patches_total` is the total number of patches over each image over each
-    prompt in the batch.
-    """
+    pixel_values: Annotated[
+        torch.Tensor,
+        TensorShape("np", 3, "h", "w"),
+    ]
 
-    num_patches: torch.Tensor
-    """Shape: `(batch_size * num_images)`"""
+    num_patches: Annotated[
+        torch.Tensor,
+        TensorShape("bn"),
+    ]
 
 
 class AyaVisionMultiModalProjector(nn.Module):
@@ -383,21 +394,6 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
             e.flatten(0, 2) for e in image_embeds.split(num_patches.tolist())
         ]
 
-    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
-        h = w = self.config.vision_config.image_size
-        expected_dims = (3, h, w)
-
-        def _validate_shape(d: torch.Tensor):
-            if d.shape != expected_dims:
-                raise ValueError(
-                    "The expected shape of pixel values per image per batch "
-                    f"is {expected_dims}. You supplied {tuple(d.shape)}.")
-
-        for d in data:
-            _validate_shape(d)
-
-        return data
-
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[AyaVisionImagePixelInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
@@ -405,22 +401,17 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
         image_embeds = kwargs.pop("image_embeds", None)
         assert image_embeds is None, "Aya Vision does not support image_embeds."
 
-        if not isinstance(pixel_values, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of pixel values. "
-                             f"Got type: {type(pixel_values)}")
-        if num_patches is not None and not isinstance(num_patches,
-                                                      (torch.Tensor, list)):
-            raise ValueError("Incorrect type of num_patches. "
-                             f"Got type: {type(num_patches)}")
-
-        pixel_values = flatten_bn(pixel_values, concat=True)
-        num_patches = flatten_bn(num_patches, concat=True)
+        if pixel_values is None:
+            return None
 
         return AyaVisionImagePixelInputs(
             type="pixel_values",
-            pixel_values=self._validate_pixel_values(pixel_values),
-            num_patches=num_patches,
-        )
+            pixel_values=flatten_bn(pixel_values, concat=True),
+            num_patches=flatten_bn(num_patches, concat=True),
+            resolve_bindings={
+                "h": self.config.vision_config.image_size,
+                "w": self.config.vision_config.image_size,
+            })
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model

From f27fdfc3ed79689ec98ef1c609a31d8c3b11e6a9 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 26 Jul 2025 21:09:29 +0800
Subject: [PATCH 32/57] [Bugfix] Investigate Qwen2-VL failing test (#21527)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 tests/models/multimodal/generation/test_common.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index e2e35e9b27218..b1483abf627ba 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -677,6 +677,7 @@ VLM_TEST_SETTINGS = {
         prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
         img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
         video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
+        multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.",    # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,

From 1cd6eaba54a2461bcdca3958e5befd9c40907657 Mon Sep 17 00:00:00 2001
From: Maximilien de Bayser <mbayser@br.ibm.com>
Date: Sat, 26 Jul 2025 10:09:52 -0300
Subject: [PATCH 33/57] Support encoder-only models without KV-Cache (#21270)

Signed-off-by: Max de Bayser <maxdebayser@gmail.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
---
 .../prithvi_geospatial_mae.py                 |   2 +-
 tests/conftest.py                             |  13 +-
 .../test_model_load_with_params.py            |  12 +-
 .../models/language/pooling/test_embedding.py |  14 +-
 tests/models/language/pooling/test_jina.py    |   8 +
 tests/v1/attention/utils.py                   |   1 +
 tests/v1/test_oracle.py                       |   1 -
 tests/v1/test_utils.py                        |   3 +-
 vllm/engine/arg_utils.py                      |   3 +-
 vllm/model_executor/models/bert.py            |  18 +-
 vllm/model_executor/models/roberta.py         |  85 ++++++--
 vllm/v1/attention/backends/flash_attn.py      |  91 ++++++++-
 vllm/v1/attention/backends/utils.py           |   3 +
 vllm/v1/engine/core.py                        |   6 +
 vllm/v1/spec_decode/eagle.py                  |   1 +
 vllm/v1/worker/cpu_model_runner.py            |   4 +
 vllm/v1/worker/gpu_model_runner.py            | 186 ++++++++++++++----
 17 files changed, 352 insertions(+), 99 deletions(-)

diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py
index 4fdc7a3cf709e..b6007b9f46301 100644
--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
@@ -3,12 +3,12 @@
 import argparse
 import datetime
 import os
-import re
 from typing import Union
 
 import albumentations
 import numpy as np
 import rasterio
+import regex as re
 import torch
 from einops import rearrange
 from terratorch.datamodules import Sen1Floods11NonGeoDataModule
diff --git a/tests/conftest.py b/tests/conftest.py
index a18dbf58c803d..fd4956bdb24cc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1062,8 +1062,17 @@ class VllmRunner:
         return [req_output.outputs.score for req_output in req_outputs]
 
     def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
-        executor = self.llm.llm_engine.model_executor
-        return executor.apply_model(func)
+        if hasattr(self.llm.llm_engine, "model_executor"):
+            # This works either in V0 or in V1 with
+            # VLLM_ENABLE_V1_MULTIPROCESSING=0
+            executor = self.llm.llm_engine.model_executor
+            return executor.apply_model(func)
+
+        # This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1
+        def _apply_model(self):
+            return func(self.get_model())
+
+        return self.llm.llm_engine.collective_rpc(_apply_model)
 
     def __enter__(self):
         return self
diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py
index aae9a4d1ef11d..667a63e769324 100644
--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -22,10 +22,12 @@ REVISION_ROBERTA = os.environ.get("REVISION", "main")
 
 @pytest.mark.skipif(current_platform.is_rocm(),
                     reason="Xformers backend is not supported on ROCm.")
-def test_model_loading_with_params(vllm_runner):
+def test_model_loading_with_params(vllm_runner, monkeypatch):
     """
     Test parameter weight loading with tp>1.
     """
+    # to use apply_model
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
     with vllm_runner(model_name=MODEL_NAME,
                      revision=REVISION,
                      dtype="float16",
@@ -61,10 +63,12 @@ def test_model_loading_with_params(vllm_runner):
 
 @pytest.mark.skipif(current_platform.is_rocm(),
                     reason="Xformers backend is not supported on ROCm.")
-def test_roberta_model_loading_with_params(vllm_runner):
+def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
     """
     Test parameter weight loading with tp>1.
     """
+    # to use apply_model
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
     with vllm_runner(model_name=MODEL_NAME_ROBERTA,
                      revision=REVISION_ROBERTA,
                      dtype="float16",
@@ -101,10 +105,12 @@ def test_roberta_model_loading_with_params(vllm_runner):
 
 @pytest.mark.skipif(current_platform.is_rocm(),
                     reason="Xformers backend is not supported on ROCm.")
-def test_facebook_roberta_model_loading_with_params(vllm_runner):
+def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
     """
     Test loading roberta-base model with no lm_head.
     """
+    # to use apply_model
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
     model_name = "FacebookAI/roberta-base"
     with vllm_runner(model_name=model_name,
                      dtype="float16",
diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py
index cc9e4102d5b79..ba42e389fc150 100644
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -39,17 +39,9 @@ def v1(run_with_both_engines):
         pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
                      marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
         # [Encoder-only]
-        pytest.param(
-            "BAAI/bge-base-en-v1.5",
-            marks=[
-                # CPU only supports V1
-                pytest.mark.core_model,
-                pytest.mark.skip_v1
-            ]),
-        pytest.param("sentence-transformers/all-MiniLM-L12-v2",
-                     marks=[pytest.mark.skip_v1]),
-        pytest.param("intfloat/multilingual-e5-small",
-                     marks=[pytest.mark.skip_v1]),
+        pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
+        pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
+        pytest.param("intfloat/multilingual-e5-small"),
         pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
                      marks=[pytest.mark.skip_v1]),
         # [Cross-Encoder]
diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py
index 16c711407aeae..a4681baa51efc 100644
--- a/tests/models/language/pooling/test_jina.py
+++ b/tests/models/language/pooling/test_jina.py
@@ -23,6 +23,14 @@ RERANK_MODELS = [
 ]
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
 def test_embed_models_mteb(hf_runner, vllm_runner,
                            model_info: EmbedModelInfo) -> None:
diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py
index 69bd4a2060ae5..ae2ab6e6413c0 100644
--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@@ -93,6 +93,7 @@ def create_common_attn_metadata(
         max_query_len=max_query_len,
         block_table_tensor=block_table_tensor,
         slot_mapping=slot_mapping,
+        causal=True,
     )
 
 
diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py
index b4d4348c7fd9b..cc59287a9fbe6 100644
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -13,7 +13,6 @@ UNSUPPORTED_MODELS_V1 = [
     "openai/whisper-large-v3",  # transcription
     "facebook/bart-large-cnn",  # encoder decoder
     "state-spaces/mamba-130m-hf",  # mamba1
-    "BAAI/bge-m3",  # embedding
 ]
 
 MODEL = "meta-llama/Llama-3.2-1B-Instruct"
diff --git a/tests/v1/test_utils.py b/tests/v1/test_utils.py
index 0b892bd9dffdc..00d98a873a310 100644
--- a/tests/v1/test_utils.py
+++ b/tests/v1/test_utils.py
@@ -1,9 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import re
-
 import pytest
+import regex as re
 import requests
 import torch
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 7099680047182..b7dbff397d2f2 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1649,7 +1649,8 @@ class EngineArgs:
 
         if (self.max_num_seqs is None
                 and usage_context in default_max_num_seqs):
-            self.max_num_seqs = default_max_num_seqs[usage_context]
+            self.max_num_seqs = min(default_max_num_seqs[usage_context],
+                                    self.max_num_batched_tokens or sys.maxsize)
 
             logger.debug("Setting max_num_seqs to %d for %s usage context.",
                          self.max_num_seqs, use_context_value)
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index c3066aaa2b87d..504621c8abd8f 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -12,7 +12,6 @@ from vllm.attention import Attention, AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, PoolerConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
@@ -60,7 +59,6 @@ class BertEmbedding(nn.Module):
     def forward(
         self,
         input_ids: torch.Tensor,
-        seq_lens: torch.Tensor,
         position_ids: torch.Tensor,
         token_type_ids: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
@@ -119,7 +117,6 @@ class BertPooler(Pooler):
         return pooled_output
 
 
-@support_torch_compile
 class BertEncoder(nn.Module):
 
     def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
@@ -337,6 +334,7 @@ class BertOutput(nn.Module):
         return hidden_states
 
 
+@support_torch_compile
 class BertModel(nn.Module, SupportsQuant):
 
     is_pooling_model = True
@@ -368,13 +366,9 @@ class BertModel(nn.Module, SupportsQuant):
         if inputs_embeds is not None:
             hidden_states = inputs_embeds
         else:
-            attn_metadata = get_forward_context().attn_metadata
-            assert hasattr(attn_metadata, "seq_lens_tensor")
-            hidden_states = self.embeddings(
-                input_ids=input_ids,
-                seq_lens=attn_metadata.seq_lens_tensor,
-                position_ids=position_ids,
-                token_type_ids=token_type_ids)
+            hidden_states = self.embeddings(input_ids=input_ids,
+                                            position_ids=position_ids,
+                                            token_type_ids=token_type_ids)
         return self.encoder(hidden_states)
 
     def _load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
@@ -447,7 +441,7 @@ class BertPoolingModel(BertModel):
         return loaded_params
 
 
-class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant):
+class BertEmbeddingModel(nn.Module, SupportsQuant):
     """A model that uses Bert to provide embedding functionalities.
 
     This class encapsulates the BertModel and provides an interface for
@@ -474,11 +468,13 @@ class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant):
         self,
         input_ids: Optional[torch.Tensor],
         positions: torch.Tensor,
+        token_type_ids: Optional[torch.Tensor] = None,
         intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         return self.model(input_ids=input_ids,
                           position_ids=positions,
+                          token_type_ids=token_type_ids,
                           inputs_embeds=inputs_embeds,
                           intermediate_tensors=intermediate_tensors)
 
diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
index c6b4116440346..77e072c792755 100644
--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -9,6 +9,7 @@ from torch import nn
 from transformers import RobertaConfig
 
 from vllm.config import VllmConfig
+from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.pooler import (ClassifierPooler, CLSPool,
                                                DispatchPooler, Pooler)
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -51,33 +52,12 @@ class RobertaEmbedding(nn.Module):
     def forward(
         self,
         input_ids: torch.Tensor,
-        seq_lens: torch.Tensor,
         position_ids: torch.Tensor,
         token_type_ids: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         input_shape = input_ids.size()
         inputs_embeds = self.word_embeddings(input_ids)
 
-        # Replace position ids because in RoBERTa models
-        # they have to start at padding_idx + 1 and ignore
-        # existing padding tokens
-        # References:
-        # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133
-        # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669
-        seq_lens_list = seq_lens.tolist()
-        new_pos_list = []
-        for positions, tokens in zip(position_ids.split(seq_lens_list),
-                                     input_ids.split(seq_lens_list)):
-            # Verify assumption that incoming position are
-            # always a sequence from 0 to N.
-            expected_pos = torch.arange(positions.size()[0],
-                                        dtype=torch.long,
-                                        device=inputs_embeds.device)
-            assert torch.equal(positions, expected_pos)
-            new_pos_list.append(
-                create_position_ids_from_input_ids(tokens, self.padding_idx))
-        position_ids = torch.cat(new_pos_list)
-
         # Position embeddings.
         position_embeddings = self.position_embeddings(position_ids)
         if token_type_ids is None:
@@ -119,6 +99,32 @@ class RobertaEmbeddingModel(BertEmbeddingModel):
        _pooler: An instance of Pooler used for pooling operations.
    """
 
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        self.padding_idx = vllm_config.model_config.hf_config.pad_token_id
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        positions: torch.Tensor,
+        token_type_ids: Optional[torch.Tensor] = None,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+
+        # Fix Roberta positions here outside of the CUDA graph.
+        # Because we need the to extract the sequences from
+        # input_ids the control flow is data dependent.
+        replace_roberta_positions(input_ids=input_ids,
+                                  position_ids=positions,
+                                  padding_idx=self.padding_idx)
+
+        return self.model(input_ids=input_ids,
+                          position_ids=positions,
+                          token_type_ids=token_type_ids,
+                          inputs_embeds=inputs_embeds,
+                          intermediate_tensors=intermediate_tensors)
+
     def _build_model(self,
                      vllm_config: VllmConfig,
                      prefix: str = "") -> Union[BertModel, BertWithRope]:
@@ -175,6 +181,7 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding,
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
+        self.padding_idx = vllm_config.model_config.hf_config.pad_token_id
 
         self.num_labels = config.num_labels
         self.roberta = BertModel(vllm_config=vllm_config,
@@ -216,6 +223,9 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding,
         inputs_embeds: Optional[torch.Tensor] = None,
         token_type_ids: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        replace_roberta_positions(input_ids=input_ids,
+                                  position_ids=positions,
+                                  padding_idx=self.padding_idx)
         return self.roberta(input_ids=input_ids,
                             position_ids=positions,
                             inputs_embeds=inputs_embeds,
@@ -245,3 +255,36 @@ def create_position_ids_from_input_ids(input_ids,
                            past_key_values_length) * mask
 
     return incremental_indices.long() + padding_idx
+
+
+def replace_roberta_positions(input_ids: torch.Tensor,
+                              position_ids: torch.Tensor,
+                              padding_idx: int) -> None:
+
+    seq_lens: Optional[torch.Tensor] = None
+    attn_metadata = get_forward_context().attn_metadata
+    if attn_metadata is not None:  # can be None during warmup
+        if isinstance(attn_metadata, dict):
+            attn_metadata = next(iter(attn_metadata.values()))
+        # TODO: remove "seq_lens_tensor" after V0 is removed
+        seq_lens = getattr(attn_metadata, "seq_lens_tensor",
+                           getattr(attn_metadata, "seq_lens", None))
+
+    if seq_lens is not None:
+        assert isinstance(seq_lens, torch.Tensor)
+
+        # Replace position ids because in RoBERTa models
+        # they have to start at padding_idx + 1 and ignore
+        # existing padding tokens
+        # References:
+        # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133
+        # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669
+        token_list = torch.split(input_ids[:torch.sum(seq_lens)],
+                                 seq_lens.tolist())
+
+        offset = 0
+        for tokens in token_list:
+            length = tokens.shape[0]
+            position_ids[offset:offset+length] = \
+                create_position_ids_from_input_ids(tokens, padding_idx)
+            offset = offset + length
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 5fe274f2c65b2..7c8a5e056fea5 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -130,6 +130,8 @@ class FlashAttentionMetadata:
     prefix_scheduler_metadata: Optional[torch.Tensor] = None
     max_num_splits: int = 0
 
+    causal: bool = True
+
 
 def _get_sliding_window_configs(
         vllm_config: VllmConfig) -> set[Optional[tuple[int, int]]]:
@@ -213,6 +215,7 @@ class FlashAttentionMetadataBuilder(
         seq_lens_cpu = common_attn_metadata.seq_lens_cpu
         block_table_tensor = common_attn_metadata.block_table_tensor
         slot_mapping = common_attn_metadata.slot_mapping
+        causal = common_attn_metadata.causal
 
         # the overhead of the aot schedule is not worth it for spec-decode
         aot_schedule = self.aot_schedule and not fast_build
@@ -288,7 +291,7 @@ class FlashAttentionMetadataBuilder(
                                           max_query_len=max_query_len,
                                           seqlens=seq_lens,
                                           max_seq_len=max_seq_len,
-                                          causal=True)
+                                          causal=causal)
 
         if self.use_full_cuda_graph:
             assert scheduler_metadata is not None
@@ -326,7 +329,7 @@ class FlashAttentionMetadataBuilder(
             suffix_kv_lens=suffix_kv_lens,
             prefix_scheduler_metadata=prefix_scheduler_metadata,
             max_num_splits=max_num_splits,
-        )
+            causal=causal)
         return attn_metadata
 
     def can_run_in_cudagraph(
@@ -375,11 +378,14 @@ class FlashAttentionImpl(AttentionImpl):
 
         FlashAttentionBackend.validate_head_size(head_size)
 
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
+        if attn_type not in [
+                AttentionType.DECODER, AttentionType.ENCODER_ONLY
+        ]:
+            raise NotImplementedError("Encoder/decoder cross-attention "
+                                      "is not implemented for "
                                       "FlashAttentionImpl")
+
+        self.attn_type = attn_type
         self.vllm_flash_attn_version = get_flash_attn_version()
         if is_quantized_kv_cache(self.kv_cache_dtype) \
             and not flash_attn_supports_fp8():
@@ -422,6 +428,8 @@ class FlashAttentionImpl(AttentionImpl):
             # Profiling run.
             return output
 
+        attn_type = self.attn_type
+
         # IMPORTANT!
         # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in
         # eager-mode PyTorch. Thus, we need to be careful about any CPU overhead
@@ -432,6 +440,18 @@ class FlashAttentionImpl(AttentionImpl):
         # performance to make sure it does not introduce any overhead.
 
         num_actual_tokens = attn_metadata.num_actual_tokens
+
+        # Handle encoder attention differently - no KV cache needed
+        if attn_type in (AttentionType.ENCODER_ONLY, ):
+            # For encoder attention,
+            # we use direct Q, K, V tensors without caching
+            return self._forward_encoder_attention(query[:num_actual_tokens],
+                                                   key[:num_actual_tokens],
+                                                   value[:num_actual_tokens],
+                                                   output[:num_actual_tokens],
+                                                   attn_metadata, layer)
+
+        # For decoder and cross-attention, use KV cache as before
         key_cache, value_cache = kv_cache.unbind(0)
 
         if self.kv_sharing_target_layer_name is None:
@@ -483,7 +503,7 @@ class FlashAttentionImpl(AttentionImpl):
                 seqused_k=seqused_k,
                 max_seqlen_k=max_seqlen_k,
                 softmax_scale=self.scale,
-                causal=True,
+                causal=attn_metadata.causal,
                 alibi_slopes=self.alibi_slopes,
                 window_size=self.sliding_window,
                 block_table=block_table,
@@ -524,6 +544,63 @@ class FlashAttentionImpl(AttentionImpl):
         )
         return output
 
+    def _forward_encoder_attention(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        output: torch.Tensor,
+        attn_metadata: FlashAttentionMetadata,
+        layer: torch.nn.Module,
+    ) -> torch.Tensor:
+        """Forward pass for encoder attention without KV cache.
+
+        Args:
+            query: shape = [num_encoder_tokens, num_heads, head_size]
+            key: shape = [num_encoder_tokens, num_kv_heads, head_size]
+            value: shape = [num_encoder_tokens, num_kv_heads, head_size]
+            output: shape = [num_encoder_tokens, num_heads, head_size]
+            attn_metadata: Encoder attention metadata
+            layer: The attention layer
+        """
+        # For encoder attention, process FP8 quantization if needed
+        if self.kv_cache_dtype.startswith("fp8"):
+            raise NotImplementedError(
+                "quantization is not supported for encoder attention")
+
+        # Use encoder-specific metadata for sequence information
+        cu_seqlens_q = attn_metadata.query_start_loc
+        cu_seqlens_k = attn_metadata.query_start_loc
+        max_seqlen_q = attn_metadata.max_query_len
+        max_seqlen_k = attn_metadata.max_query_len
+
+        descale_shape = (
+            cu_seqlens_q.shape[0] - 1,  # type: ignore[union-attr]
+            self.num_kv_heads)
+
+        # Call flash attention directly on Q, K, V tensors
+        flash_attn_varlen_func(
+            q=query,
+            k=key,
+            v=value,
+            out=output,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+            max_seqlen_q=max_seqlen_q,
+            max_seqlen_k=max_seqlen_k,
+            softmax_scale=self.scale,
+            causal=False,  # Encoder attention is bidirectional
+            alibi_slopes=self.alibi_slopes,
+            window_size=self.sliding_window,
+            softcap=self.logits_soft_cap,
+            fa_version=self.vllm_flash_attn_version,
+            q_descale=layer._q_scale.expand(descale_shape),
+            k_descale=layer._k_scale.expand(descale_shape),
+            v_descale=layer._v_scale.expand(descale_shape),
+        )
+
+        return output
+
 
 def use_cascade_attention(
     common_prefix_len: int,
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index fc8649d587eee..b13362f8a8d8d 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -59,6 +59,8 @@ class CommonAttentionMetadata:
     block_table_tensor: torch.Tensor
     slot_mapping: torch.Tensor
 
+    causal: bool = True
+
 
 M = TypeVar("M")
 
@@ -395,6 +397,7 @@ def make_local_attention_virtual_batches(
         max_query_len=seqlens_q_local.max(),
         block_table_tensor=block_table_local,
         slot_mapping=common_attn_metadata.slot_mapping,
+        causal=True,
     )
 
 
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 4124ee05326ce..57f60c4b289bb 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -111,6 +111,12 @@ class EngineCore:
                 "compatibility may not be maintained.",
                 vllm_config.scheduler_config.scheduler_cls)
 
+        if len(kv_cache_config.kv_cache_groups) == 0:
+            # Encoder models without KV cache don't support
+            # chunked prefill. But do SSM models?
+            logger.info("Disabling chunked prefill for model without KVCache")
+            vllm_config.scheduler_config.chunked_prefill_enabled = False
+
         self.scheduler: SchedulerInterface = Scheduler(
             vllm_config=vllm_config,
             kv_cache_config=kv_cache_config,
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 967847c02ff2f..63f6fc276189d 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -330,6 +330,7 @@ class EagleProposer:
             max_query_len=new_query_len_per_req.max().item(),
             block_table_tensor=common_attn_metadata.block_table_tensor,
             slot_mapping=common_attn_metadata.slot_mapping[token_indices],
+            causal=True,
         )
 
         return spec_common_attn_metadata, token_indices
diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py
index ca94ac8c60545..6b2b50a57e1f8 100644
--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -4,6 +4,7 @@ from contextlib import contextmanager
 from typing import Any
 
 import torch
+import torch.nn as nn
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
@@ -59,6 +60,9 @@ class CPUModelRunner(GPUModelRunner):
                                               self.scheduler_config,
                                               self.lora_config, self.device)
 
+    def get_model(self) -> nn.Module:
+        return self.model
+
     def warming_up_model(self) -> None:
         logger.info("Warming up model for the compilation...")
         # Only generate graph for the generic shape
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 6ddb2c422dff7..a1eef5e573edb 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -126,6 +126,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         self.is_multimodal_model = model_config.is_multimodal_model
         self.is_pooling_model = model_config.pooler_config is not None
+        self.is_encoder_only_model = False
         self.model_supports_multimodal_raw_input = (
             model_config.model_supports_multimodal_raw_input)
         self.max_model_len = model_config.max_model_len
@@ -735,6 +736,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         spec_decode_common_attn_metadata = None
 
         attn_metadata: dict[str, Any] = {}
+
+        # Prepare encoder attention metadata separately
+        # (encoder layers are not in KV cache groups)
+        if self.is_encoder_only_model:
+            common_attn_metadata, encoder_attn_metadata = \
+                self._build_encoder_only_attn_metadata(
+                scheduler_output)
+
+            # Add encoder attention metadata for all encoder layers
+            attention_layers = get_layers_from_vllm_config(
+                self.vllm_config, Attention)
+            for layer_name, attn_module in attention_layers.items():
+                if attn_module.attn_type == AttentionType.ENCODER_ONLY:
+                    attn_metadata[layer_name] = encoder_attn_metadata
+
         # Prepare the attention metadata for each KV cache group and make layers
         # in the same group share the same metadata.
         for kv_cache_group_id, kv_cache_group_spec in enumerate(
@@ -760,6 +776,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 max_query_len=max_num_scheduled_tokens,
                 block_table_tensor=blk_table_tensor,
                 slot_mapping=slot_mapping,
+                causal=True,
             )
 
             if self.speculative_config and \
@@ -2102,7 +2119,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     block_table_tensor=self.input_batch.block_table[
                         kv_cache_group_id].get_device_tensor()[:num_reqs],
                     slot_mapping=self.input_batch.
-                    block_table[kv_cache_group_id].slot_mapping[:num_tokens])
+                    block_table[kv_cache_group_id].slot_mapping[:num_tokens],
+                    causal=True)
 
                 attn_metadata_i = self.attn_metadata_builders[
                     kv_cache_group_id].build_for_cudagraph_capture(
@@ -2466,6 +2484,49 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
                     elapsed_time, cuda_graph_size / (1 << 30))
 
+    def _initialize_single_attn_backend(
+        self, kv_cache_spec: KVCacheSpec
+    ) -> tuple[AttentionBackend, AttentionMetadataBuilder]:
+        if isinstance(kv_cache_spec, AttentionSpec):
+            attn_backend_i = get_attn_backend(
+                kv_cache_spec.head_size,
+                self.dtype,
+                kv_cache_spec.dtype,
+                kv_cache_spec.block_size,
+                self.model_config.is_attention_free,
+                use_mla=kv_cache_spec.use_mla,
+            )
+            if attn_backend_i is None:
+                error_msg = (f"Error with get_attn_backend: "
+                             f"{kv_cache_spec.head_size=}, "
+                             f"{self.dtype=}, {kv_cache_spec.dtype=}, "
+                             f"{kv_cache_spec.block_size=}, "
+                             f"{self.model_config.is_attention_free=}, "
+                             f"{kv_cache_spec.use_mla=}")
+                logger.error(error_msg)
+                raise NotImplementedError(
+                    "Non-Attention backend is not supported by V1 "
+                    "GPUModelRunner.")
+        elif isinstance(kv_cache_spec, MambaSpec):
+            attn_backend_i = Mamba2AttentionBackend
+        else:
+            raise ValueError(
+                f"Unknown KV cache spec type: {type(kv_cache_spec)}")
+
+        attn_metadata_builder_i = attn_backend_i.get_builder_cls()(
+            kv_cache_spec,
+            self.vllm_config,
+            self.device,
+        )
+
+        if (self.full_cuda_graph
+                and not attn_metadata_builder_i.full_cudagraph_supported):
+            raise ValueError(
+                f"Full CUDAGraph not supported for "
+                f"{attn_backend_i.__name__}. Turn off CompilationConfig."
+                f"full_cuda_graph or use a different attention backend.")
+        return attn_backend_i, attn_metadata_builder_i
+
     def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None:
         """
         Initialize the attention backends and attention metadata builders.
@@ -2476,48 +2537,45 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         for i, kv_cache_group_spec in enumerate(
                 kv_cache_config.kv_cache_groups):
             kv_cache_spec = kv_cache_group_spec.kv_cache_spec
-            if isinstance(kv_cache_spec, AttentionSpec):
-                attn_backend_i = get_attn_backend(
-                    kv_cache_spec.head_size,
-                    self.dtype,
-                    kv_cache_spec.dtype,
-                    kv_cache_spec.block_size,
-                    self.model_config.is_attention_free,
-                    use_mla=kv_cache_spec.use_mla,
-                )
-                if attn_backend_i is None:
-                    error_msg = (f"Error with get_attn_backend: "
-                                 f"{kv_cache_spec.head_size=}, "
-                                 f"{self.dtype=}, {kv_cache_spec.dtype=}, "
-                                 f"{kv_cache_spec.block_size=}, "
-                                 f"{self.model_config.is_attention_free=}, "
-                                 f"{kv_cache_spec.use_mla=}")
-                    logger.error(error_msg)
-                    raise NotImplementedError(
-                        "Non-Attention backend is not supported by V1 "
-                        "GPUModelRunner.")
-            elif isinstance(kv_cache_spec, MambaSpec):
-                attn_backend_i = Mamba2AttentionBackend
-            else:
-                raise ValueError(
-                    f"Unknown KV cache spec type: {type(kv_cache_spec)}")
-
-            attn_metadata_builder_i = attn_backend_i.get_builder_cls()(
-                kv_cache_spec,
-                self.vllm_config,
-                self.device,
-            )
-
-            if (self.full_cuda_graph
-                    and not attn_metadata_builder_i.full_cudagraph_supported):
-                raise ValueError(
-                    f"Full CUDAGraph not supported for "
-                    f"{attn_backend_i.__name__}. Turn off CompilationConfig."
-                    f"full_cuda_graph or use a different attention backend.")
 
+            attn_backend_i, attn_metadata_builder_i = \
+                self._initialize_single_attn_backend(kv_cache_spec)
             self.attn_backends.append(attn_backend_i)
             self.attn_metadata_builders.append(attn_metadata_builder_i)
 
+        if len(self.attn_backends) > 0:
+            return
+
+        # Check if model is encoder-only
+        block_size = self.vllm_config.cache_config.block_size
+        use_mla = self.vllm_config.model_config.use_mla
+        attn_specs = list[AttentionSpec]()
+        attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
+        for attn_module in attn_layers.values():
+
+            if attn_module.attn_type == AttentionType.ENCODER_ONLY:
+                assert attn_module.sliding_window is None, "Sliding "
+                "window attention is not supported for encoder-only models"
+
+                attn_specs.append(
+                    FullAttentionSpec(block_size=block_size,
+                                      num_kv_heads=attn_module.num_kv_heads,
+                                      head_size=attn_module.head_size,
+                                      dtype=self.kv_cache_dtype,
+                                      use_mla=use_mla))
+            else:
+                raise ValueError("Expected only encoder-only layers")
+
+        if len(attn_specs) > 0:
+            assert len(attn_specs) == len(attn_layers), \
+                "All or none of the layers are expected to be encoder-only"
+
+            attn_backend, attn_metadata_builder = \
+                self._initialize_single_attn_backend(attn_specs[0])
+            self.attn_backends.append(attn_backend)
+            self.attn_metadata_builders.append(attn_metadata_builder)
+            self.is_encoder_only_model = True
+
     def may_reinitialize_input_batch(self,
                                      kv_cache_config: KVCacheConfig) -> None:
         """
@@ -2833,3 +2891,53 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     page_size_padded=page_size_padded)
 
         return kv_cache_spec
+
+    def _build_encoder_only_attn_metadata(
+            self, scheduler_output: "SchedulerOutput") -> \
+                tuple[CommonAttentionMetadata, Any]:
+        """Prepare encoder attention metadata for encoder-only models.
+
+        Args:
+            scheduler_output: Scheduler output
+
+        Returns:
+            dict[str, Any]: Encoder attention metadata
+        """
+        num_reqs = self.input_batch.num_reqs
+        total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
+
+        # Get the number of scheduled tokens for each request.
+        req_ids = self.input_batch.req_ids
+        tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
+        max_num_scheduled_tokens = max(tokens)
+
+        # Use the first attention metadata builder
+        # to create encoder attention metadata
+        builder = self.attn_metadata_builders[0]
+
+        dummy_block_table = torch.zeros((num_reqs, 1),
+                                        dtype=torch.int32,
+                                        device=self.device)
+        dummy_slot_mapping = torch.zeros((total_num_scheduled_tokens, ),
+                                         dtype=torch.int32,
+                                         device=self.device)
+
+        common_metadata = CommonAttentionMetadata(
+            query_start_loc=self.query_start_loc[:num_reqs + 1],
+            query_start_loc_cpu=self.query_start_loc_cpu[:num_reqs + 1],
+            seq_lens=self.seq_lens[:num_reqs],
+            seq_lens_cpu=self.seq_lens_cpu[:num_reqs],
+            num_computed_tokens_cpu=self.input_batch.
+            num_computed_tokens_cpu_tensor[:num_reqs],
+            num_reqs=num_reqs,
+            num_actual_tokens=total_num_scheduled_tokens,
+            max_query_len=max_num_scheduled_tokens,
+            block_table_tensor=dummy_block_table,
+            slot_mapping=dummy_slot_mapping,
+            causal=False,
+        )
+
+        return common_metadata, builder.build(
+            common_prefix_len=0,  # No cascade for encoder
+            common_attn_metadata=common_metadata,
+        )

From c215f5c877e0518f328613a98f31bb9cdb32a889 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Sat, 26 Jul 2025 10:06:14 -0400
Subject: [PATCH 34/57] [Bug] Fix `has_flashinfer_moe` Import Error when it is
 not installed (#21634)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 vllm/utils/flashinfer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index b25e3a49f1812..ebc54fd029da6 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -82,7 +82,8 @@ autotune = _lazy_import_wrapper(
 @functools.cache
 def has_flashinfer_moe() -> bool:
     """Return ``True`` if FlashInfer MoE module is available."""
-    return importlib.util.find_spec("flashinfer.fused_moe") is not None
+    return has_flashinfer() and importlib.util.find_spec(
+        "flashinfer.fused_moe") is not None
 
 
 @functools.cache

From a40a8506df78b965f034aeae260e664c63a7f5d5 Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi" <yeq@meta.com>
Date: Sat, 26 Jul 2025 07:07:21 -0700
Subject: [PATCH 35/57] [Misc] Improve memory profiling debug message (#21429)

Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
---
 vllm/v1/worker/gpu_worker.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index dcfb038d28c20..d9d1f14f0554c 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -246,11 +246,21 @@ class Worker(WorkerBase):
         available_kv_cache_memory = self.requested_memory \
             - profile_result.non_kv_cache_memory
 
+        unrequested_memory = self.init_snapshot.free_memory \
+            - self.requested_memory
         logger.debug(
-            "Initial free memory: %.2f GiB, free memory: %.2f GiB, "
-            "requested GPU memory: %.2f GiB",
-            GiB(self.init_snapshot.free_memory), GiB(free_gpu_memory),
-            GiB(self.requested_memory))
+            "Initial free memory: %.2f GiB; "
+            "Requested memory: %.2f (util), %.2f GiB",
+            GiB(self.init_snapshot.free_memory),
+            self.cache_config.gpu_memory_utilization,
+            GiB(self.requested_memory),
+        )
+        logger.debug(
+            "Free memory after profiling: %.2f GiB (total), "
+            "%.2f GiB (within requested)",
+            GiB(free_gpu_memory),
+            GiB(free_gpu_memory - unrequested_memory),
+        )
         logger.debug(profile_result)
         logger.info("Available KV cache memory: %.2f GiB",
                     GiB(available_kv_cache_memory))

From 97d6c30cc965e70579bfdad27e7514592752096e Mon Sep 17 00:00:00 2001
From: WeiQing Chen <40507679+david6666666@users.noreply.github.com>
Date: Sat, 26 Jul 2025 22:07:40 +0800
Subject: [PATCH 36/57] [BugFix] Fix shared storage connector load kv only load
 attention layer (#21428)

Signed-off-by: David Chen <530634352@qq.com>
---
 .../kv_connector/v1/shared_storage_connector.py      | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
index 3c574d0655717..048748e6b8ecb 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
@@ -156,8 +156,16 @@ class SharedStorageConnector(KVConnectorBase_V1):
             logger.info("Inject KV cache of %d tokens to the paged memory",
                         len(request.slot_mapping))
             for layer_name in forward_context.no_compile_layers:
-                attn_layer = forward_context.no_compile_layers[layer_name]
-                kv_cache_layer = attn_layer.kv_cache[\
+                layer = forward_context.no_compile_layers[layer_name]
+
+                # Only process layers that have kv_cache
+                # attribute (attention layers) Skip non-attention
+                # layers like FusedMoE/MLP etc.
+                kv_cache_attr = getattr(layer, 'kv_cache', None)
+                if kv_cache_attr is None:
+                    continue
+
+                kv_cache_layer = kv_cache_attr[ \
                         forward_context.virtual_engine]
 
                 filename = self._generate_filename_debug(

From 56e544f24b62d647ba05cccf51756c2ebebae66f Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Sat, 26 Jul 2025 10:08:29 -0400
Subject: [PATCH 37/57] [Refactor] Remove `moe_align_block_size_triton`
 (#21335)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 .../kernels/benchmark_moe_align_block_size.py |  90 +----------
 .../layers/fused_moe/moe_align_block_size.py  | 140 +-----------------
 2 files changed, 6 insertions(+), 224 deletions(-)

diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py
index 1af5a21caf465..f540cff6261a8 100644
--- a/benchmarks/kernels/benchmark_moe_align_block_size.py
+++ b/benchmarks/kernels/benchmark_moe_align_block_size.py
@@ -5,9 +5,8 @@ import itertools
 
 import torch
 
-from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
-    moe_align_block_size_triton,
+    moe_align_block_size,
 )
 from vllm.triton_utils import triton
 
@@ -21,60 +20,6 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
     )
 
 
-def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
-    """
-    Verifies vllm vs. Triton
-    """
-    topk_ids = get_topk_ids(num_tokens, num_experts, topk)
-
-    # 1. malloc space for triton and vllm
-    # malloc enough space (max_num_tokens_padded) for the sorted ids
-    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
-    sorted_ids_triton = torch.empty(
-        (max_num_tokens_padded,), dtype=torch.int32, device="cuda"
-    )
-    expert_ids_triton = torch.empty(
-        (max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
-    )
-    num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
-
-    sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
-    expert_ids_vllm = torch.empty_like(expert_ids_triton)
-    num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
-
-    # 2. run implementations
-    moe_align_block_size_triton(
-        topk_ids,
-        num_experts,
-        block_size,
-        sorted_ids_triton,
-        expert_ids_triton,
-        num_tokens_post_pad_triton,
-    )
-
-    ops.moe_align_block_size(
-        topk_ids,
-        num_experts,
-        block_size,
-        sorted_ids_vllm,
-        expert_ids_vllm,
-        num_tokens_post_pad_vllm,
-    )
-    print(f"✅ VLLM implementation works with {num_experts} experts!")
-
-    # 3. compare results
-    if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
-        num_tokens_post_pad_triton, num_tokens_post_pad_vllm
-    ):
-        print("✅ Triton and VLLM implementations match.")
-    else:
-        print("❌ Triton and VLLM implementations DO NOT match.")
-        print("Triton expert_ids:", expert_ids_triton)
-        print("VLLM expert_ids:", expert_ids_vllm)
-        print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
-        print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
-
-
 # test configurations
 num_tokens_range = [1, 16, 256, 4096]
 num_experts_range = [16, 64, 224, 256, 280, 512]
@@ -87,8 +32,8 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
         x_names=["num_tokens", "num_experts", "topk"],
         x_vals=configs,
         line_arg="provider",
-        line_vals=["vllm", "triton"],  # "triton"
-        line_names=["VLLM", "Triton"],  # "Triton"
+        line_vals=["vllm"],
+        line_names=["vLLM"],
         plot_name="moe-align-block-size-performance",
         args={},
     )
@@ -98,36 +43,11 @@ def benchmark(num_tokens, num_experts, topk, provider):
     block_size = 256
     topk_ids = get_topk_ids(num_tokens, num_experts, topk)
 
-    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
-    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
-    max_num_m_blocks = max_num_tokens_padded // block_size
-    expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
-    num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
-
     quantiles = [0.5, 0.2, 0.8]
 
     if provider == "vllm":
         ms, min_ms, max_ms = triton.testing.do_bench(
-            lambda: ops.moe_align_block_size(
-                topk_ids,
-                num_experts,
-                block_size,
-                sorted_ids.clone(),
-                expert_ids.clone(),
-                num_tokens_post_pad.clone(),
-            ),
-            quantiles=quantiles,
-        )
-    elif provider == "triton":
-        ms, min_ms, max_ms = triton.testing.do_bench(
-            lambda: moe_align_block_size_triton(
-                topk_ids,
-                num_experts,
-                block_size,
-                sorted_ids.clone(),
-                expert_ids.clone(),
-                num_tokens_post_pad.clone(),
-            ),
+            lambda: moe_align_block_size(topk_ids, block_size, num_experts),
             quantiles=quantiles,
         )
 
@@ -151,6 +71,4 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()
 
-    print("Running correctness check...")
-    check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
     benchmark.run(print_data=True, show_plots=True)
diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
index 2c9ad509fa98e..c7d7126bab3ad 100644
--- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
+++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
@@ -5,144 +5,8 @@ from typing import Optional
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.triton_utils import tl, triton
-from vllm.utils import cdiv, round_up
-
-
-@triton.jit
-def moe_align_block_size_stage1(
-    topk_ids_ptr,
-    tokens_cnts_ptr,
-    num_experts: tl.constexpr,
-    numel: tl.constexpr,
-    tokens_per_thread: tl.constexpr,
-):
-    pid = tl.program_id(0)
-
-    start_idx = pid * tokens_per_thread
-
-    off_c = (pid + 1) * num_experts
-
-    for i in range(tokens_per_thread):
-        if start_idx + i < numel:
-            idx = tl.load(topk_ids_ptr + start_idx + i)
-            token_cnt = tl.load(tokens_cnts_ptr + off_c + idx)
-            tl.store(tokens_cnts_ptr + off_c + idx, token_cnt + 1)
-
-
-@triton.jit
-def moe_align_block_size_stage2(
-    tokens_cnts_ptr,
-    num_experts: tl.constexpr,
-):
-    pid = tl.program_id(0)
-
-    last_cnt = 0
-    for i in range(1, num_experts + 1):
-        token_cnt = tl.load(tokens_cnts_ptr + i * num_experts + pid)
-        last_cnt = last_cnt + token_cnt
-        tl.store(tokens_cnts_ptr + i * num_experts + pid, last_cnt)
-
-
-@triton.jit
-def moe_align_block_size_stage3(
-    total_tokens_post_pad_ptr,
-    tokens_cnts_ptr,
-    cumsum_ptr,
-    num_experts: tl.constexpr,
-    block_size: tl.constexpr,
-):
-    last_cumsum = 0
-    off_cnt = num_experts * num_experts
-    for i in range(1, num_experts + 1):
-        token_cnt = tl.load(tokens_cnts_ptr + off_cnt + i - 1)
-        last_cumsum = last_cumsum + tl.cdiv(token_cnt, block_size) * block_size
-        tl.store(cumsum_ptr + i, last_cumsum)
-    tl.store(total_tokens_post_pad_ptr, last_cumsum)
-
-
-@triton.jit
-def moe_align_block_size_stage4(
-    topk_ids_ptr,
-    sorted_token_ids_ptr,
-    expert_ids_ptr,
-    tokens_cnts_ptr,
-    cumsum_ptr,
-    num_experts: tl.constexpr,
-    block_size: tl.constexpr,
-    numel: tl.constexpr,
-    tokens_per_thread: tl.constexpr,
-):
-    pid = tl.program_id(0)
-    start_idx = tl.load(cumsum_ptr + pid)
-    end_idx = tl.load(cumsum_ptr + pid + 1)
-
-    for i in range(start_idx, end_idx, block_size):
-        tl.store(expert_ids_ptr + i // block_size, pid)
-
-    start_idx = pid * tokens_per_thread
-    off_t = pid * num_experts
-
-    for i in range(start_idx, tl.minimum(start_idx + tokens_per_thread,
-                                         numel)):
-        expert_id = tl.load(topk_ids_ptr + i)
-        token_cnt = tl.load(tokens_cnts_ptr + off_t + expert_id)
-        rank_post_pad = token_cnt + tl.load(cumsum_ptr + expert_id)
-        tl.store(sorted_token_ids_ptr + rank_post_pad, i)
-        tl.store(tokens_cnts_ptr + off_t + expert_id, token_cnt + 1)
-
-
-# Triton implementation based on:
-# https://github.com/sgl-project/sglang/commit/ba5112ff691d791a9e38c6c71f59324a5fcb49d0
-def moe_align_block_size_triton(
-    topk_ids: torch.Tensor,
-    num_experts: int,
-    block_size: int,
-    sorted_token_ids: torch.Tensor,
-    expert_ids: torch.Tensor,
-    num_tokens_post_pad: torch.Tensor,
-) -> None:
-    numel = topk_ids.numel()
-    grid = (num_experts, )
-    tokens_cnts = torch.zeros((num_experts + 1, num_experts),
-                              dtype=torch.int32,
-                              device=topk_ids.device)
-    cumsum = torch.zeros((num_experts + 1, ),
-                         dtype=torch.int32,
-                         device=topk_ids.device)
-    tokens_per_thread = cdiv(numel, num_experts)
-    sorted_token_ids.fill_(numel)
-    expert_ids.zero_()
-
-    moe_align_block_size_stage1[grid](
-        topk_ids,
-        tokens_cnts,
-        num_experts,
-        numel,
-        tokens_per_thread,
-    )
-    moe_align_block_size_stage2[grid](
-        tokens_cnts,
-        num_experts,
-    )
-    moe_align_block_size_stage3[(1, )](
-        num_tokens_post_pad,
-        tokens_cnts,
-        cumsum,
-        num_experts,
-        block_size,
-    )
-    moe_align_block_size_stage4[grid](
-        topk_ids,
-        sorted_token_ids,
-        expert_ids,
-        tokens_cnts,
-        cumsum,
-        num_experts,
-        block_size,
-        numel,
-        tokens_per_thread,
-    )
+from vllm.triton_utils import triton
+from vllm.utils import round_up
 
 
 def moe_align_block_size(

From 9094d11c5d74db904dff9e9b1c63da5d73707eeb Mon Sep 17 00:00:00 2001
From: Yeju Zhou <codingstream@outlook.com>
Date: Sat, 26 Jul 2025 22:09:57 +0800
Subject: [PATCH 38/57] [Bugfix][Apple Silicon] fix missing symbols when build
 from source on Mac with Apple Silicon (#21380)

Signed-off-by: Yeju Zhou <yejuzhou@outlook.com>
---
 csrc/cpu/torch_bindings.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index f1738aee980b6..b20a054648428 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -151,7 +151,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
 
   // Quantization
-#if defined(__AVX512F__) || defined(__aarch64__)
+#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
   at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
 
   // Compute int8 quantized tensor for given scaling factor.

From e7c4f9ee866296f49ac392fc32a662da0204e70c Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi" <yeq@meta.com>
Date: Sat, 26 Jul 2025 07:10:14 -0700
Subject: [PATCH 39/57] [CI/Build][Doc] Move existing benchmark scripts in
 CI/document/example to vllm bench CLI (#21355)

Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
---
 .../scripts/run-nightly-benchmarks.sh         | 24 +++----
 .../scripts/run-performance-benchmarks.sh     |  6 +-
 .../scripts/hardware_ci/run-cpu-test.sh       | 10 +--
 .buildkite/scripts/run-benchmarks.sh          |  6 +-
 .buildkite/scripts/tpu/run_bm.sh              |  2 +-
 benchmarks/README.md                          | 66 +++++++++----------
 benchmarks/auto_tune/auto_tune.sh             | 18 ++---
 benchmarks/benchmark_latency.py               |  5 ++
 benchmarks/benchmark_serving.py               |  5 ++
 benchmarks/benchmark_throughput.py            |  5 ++
 docs/contributing/profiling.md                | 10 +--
 docs/design/v1/p2p_nccl_connector.md          | 16 ++---
 .../disagg_example_p2p_nccl_xpyd.sh           | 10 +--
 .../disagg_example_nixl.sh                    |  4 +-
 14 files changed, 101 insertions(+), 86 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
index 4162905bb3cc3..29b8ccbf0a2f7 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
     echo "Container: vllm"
     # move to a completely irrelevant directory, to avoid import vllm from current folder
     export CURRENT_LLM_SERVING_ENGINE=vllm
-    
+
     return
   fi
 }
@@ -227,7 +227,7 @@ run_serving_tests() {
 
       if [[ "$dataset_name" = "sharegpt" ]]; then
 
-        client_command="python3 benchmark_serving.py \
+        client_command="vllm bench serve \
           --backend $backend \
           --tokenizer /tokenizer_cache \
           --model $model \
@@ -248,7 +248,7 @@ run_serving_tests() {
         sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
         sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
 
-        client_command="python3 benchmark_serving.py \
+        client_command="vllm bench serve \
           --backend $backend \
           --tokenizer /tokenizer_cache \
           --model $model \
@@ -267,13 +267,13 @@ run_serving_tests() {
           $client_args"
 
       else
-  
+
         echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
         exit 1
 
       fi
 
-        
+
 
       echo "Running test case $test_name with qps $qps"
       echo "Client command: $client_command"
@@ -304,7 +304,7 @@ run_serving_tests() {
 }
 
 run_genai_perf_tests() {
-  # run genai-perf tests 
+  # run genai-perf tests
 
   # $1: a json file specifying genai-perf test cases
   local genai_perf_test_file
@@ -313,14 +313,14 @@ run_genai_perf_tests() {
   # Iterate over genai-perf tests
   jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
     # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')    
-    
+    test_name=$(echo "$params" | jq -r '.test_name')
+
     # if TEST_SELECTOR is set, only run the test cases that match the selector
     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
       echo "Skip test case $test_name."
       continue
     fi
-    
+
     # prepend the current serving engine to the test name
     test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
 
@@ -371,10 +371,10 @@ run_genai_perf_tests() {
         qps=$num_prompts
         echo "now qps is $qps"
       fi
-    
+
       new_test_name=$test_name"_qps_"$qps
       backend=$CURRENT_LLM_SERVING_ENGINE
-      
+
       if [[ "$backend" == *"vllm"* ]]; then
         backend="vllm"
       fi
@@ -415,7 +415,7 @@ prepare_dataset() {
   do
     cat sonnet.txt >> sonnet_4x.txt
   done
-  
+
 }
 
 main() {
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index 630943c80c4a7..476a37b33a110 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -206,7 +206,7 @@ run_latency_tests() {
       fi
     fi
 
-    latency_command=" $latency_envs python3 benchmark_latency.py \
+    latency_command=" $latency_envs vllm bench latency \
       --output-json $RESULTS_FOLDER/${test_name}.json \
       $latency_args"
 
@@ -273,7 +273,7 @@ run_throughput_tests() {
       fi
     fi
 
-    throughput_command=" $throughput_envs python3 benchmark_throughput.py \
+    throughput_command=" $throughput_envs vllm bench throughput \
       --output-json $RESULTS_FOLDER/${test_name}.json \
       $throughput_args"
 
@@ -394,7 +394,7 @@ run_serving_tests() {
 
       # pass the tensor parallel size to the client so that it can be displayed
       # on the benchmark dashboard
-      client_command="python3 benchmark_serving.py \
+      client_command="vllm bench serve \
         --save-result \
         --result-dir $RESULTS_FOLDER \
         --result-filename ${new_test_name}.json \
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 90cc9c8446223..7c7dbb461ce0d 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1}
 export CMAKE_BUILD_PARALLEL_LEVEL=32
 
 # Setup cleanup
-remove_docker_container() { 
-    set -e; 
-    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; 
+remove_docker_container() {
+    set -e;
+    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
 }
 trap remove_docker_container EXIT
 remove_docker_container
@@ -69,7 +69,7 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" 
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
 
   # Note: disable it until supports V1
   # Run AWQ test
@@ -83,7 +83,7 @@ function cpu_tests() {
     set -e
     VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
     timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-    python3 benchmarks/benchmark_serving.py \
+    vllm bench serve \
       --backend vllm \
       --dataset-name random \
       --model meta-llama/Llama-3.2-3B-Instruct \
diff --git a/.buildkite/scripts/run-benchmarks.sh b/.buildkite/scripts/run-benchmarks.sh
index 195a8063fd743..72812218cb668 100644
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 
 # run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
+vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?
 
-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
+vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?
 
 # run server-based benchmarks and upload the result to buildkite
@@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
 
 # wait for server to start, timeout after 600 seconds
 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
     --backend vllm \
     --dataset-name sharegpt \
     --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh
index 877669cd956ac..beecaf7a740ae 100755
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@@ -77,7 +77,7 @@ done
 echo "run benchmark test..."
 echo "logging to $BM_LOG"
 echo
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
     --backend vllm \
     --model $MODEL  \
     --dataset-name sonnet \
diff --git a/benchmarks/README.md b/benchmarks/README.md
index fb8690d42db98..3b10963c3e014 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -98,7 +98,7 @@ Then run the benchmarking script
 ```bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend vllm \
   --model NousResearch/Hermes-3-Llama-3.1-8B \
   --endpoint /v1/completions \
@@ -111,25 +111,25 @@ If successful, you will see the following output
 
 ```
 ============ Serving Benchmark Result ============
-Successful requests:                     10        
-Benchmark duration (s):                  5.78      
-Total input tokens:                      1369      
-Total generated tokens:                  2212      
-Request throughput (req/s):              1.73      
-Output token throughput (tok/s):         382.89    
-Total Token throughput (tok/s):          619.85    
+Successful requests:                     10
+Benchmark duration (s):                  5.78
+Total input tokens:                      1369
+Total generated tokens:                  2212
+Request throughput (req/s):              1.73
+Output token throughput (tok/s):         382.89
+Total Token throughput (tok/s):          619.85
 ---------------Time to First Token----------------
-Mean TTFT (ms):                          71.54     
-Median TTFT (ms):                        73.88     
-P99 TTFT (ms):                           79.49     
+Mean TTFT (ms):                          71.54
+Median TTFT (ms):                        73.88
+P99 TTFT (ms):                           79.49
 -----Time per Output Token (excl. 1st token)------
-Mean TPOT (ms):                          7.91      
-Median TPOT (ms):                        7.96      
-P99 TPOT (ms):                           8.03      
+Mean TPOT (ms):                          7.91
+Median TPOT (ms):                        7.96
+P99 TPOT (ms):                           8.03
 ---------------Inter-token Latency----------------
-Mean ITL (ms):                           7.74      
-Median ITL (ms):                         7.70      
-P99 ITL (ms):                            8.39      
+Mean ITL (ms):                           7.74
+Median ITL (ms):                         7.70
+P99 ITL (ms):                            8.39
 ==================================================
 ```
 
@@ -141,7 +141,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you
 {"prompt": "What is the capital of India?"}
 {"prompt": "What is the capital of Iran?"}
 {"prompt": "What is the capital of China?"}
-``` 
+```
 
 ```bash
 # start server
@@ -150,7 +150,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
 
 ```bash
 # run benchmarking script
-python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
+vllm bench serve --port 9001 --save-result --save-detailed \
   --backend vllm \
   --model meta-llama/Llama-3.1-8B-Instruct \
   --endpoint /v1/completions \
@@ -174,7 +174,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 ```
 
 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \
@@ -194,7 +194,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
 ```
 
 ``` bash
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
     --model meta-llama/Meta-Llama-3-8B-Instruct \
     --dataset-name hf \
     --dataset-path likaixin/InstructCoder \
@@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 **`lmms-lab/LLaVA-OneVision-Data`**
 
 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \
@@ -224,7 +224,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 **`Aeala/ShareGPT_Vicuna_unfiltered`**
 
 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \
@@ -237,7 +237,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 **`AI-MO/aimo-validation-aime`**
 
 ``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
     --model Qwen/QwQ-32B \
     --dataset-name hf \
     --dataset-path AI-MO/aimo-validation-aime \
@@ -248,7 +248,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 **`philschmid/mt-bench`**
 
 ``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
     --model Qwen/QwQ-32B \
     --dataset-name hf \
     --dataset-path philschmid/mt-bench \
@@ -261,7 +261,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling
 parameters can be specified. Example client command:
 
 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend vllm \
   --model NousResearch/Hermes-3-Llama-3.1-8B \
   --endpoint /v1/completions \
@@ -296,7 +296,7 @@ The following arguments can be used to control the ramp-up:
 <br/>
 
 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model NousResearch/Hermes-3-Llama-3.1-8B \
   --dataset-name sonnet \
   --dataset-path vllm/benchmarks/sonnet.txt \
@@ -314,7 +314,7 @@ Total num output tokens:  1500
 **VisionArena Benchmark for Vision Language Models**
 
 ``` bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --backend vllm-chat \
   --dataset-name hf \
@@ -336,7 +336,7 @@ Total num output tokens:  1280
 ``` bash
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
 VLLM_USE_V1=1 \
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
     --dataset-name=hf \
     --dataset-path=likaixin/InstructCoder \
     --model=meta-llama/Meta-Llama-3-8B-Instruct \
@@ -360,7 +360,7 @@ Total num output tokens:  204800
 **`lmms-lab/LLaVA-OneVision-Data`**
 
 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --backend vllm-chat \
   --dataset-name hf \
@@ -373,7 +373,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
 **`Aeala/ShareGPT_Vicuna_unfiltered`**
 
 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --backend vllm-chat \
   --dataset-name hf \
@@ -385,7 +385,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
 **`AI-MO/aimo-validation-aime`**
 
 ```bash
-python3 benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model Qwen/QwQ-32B \
   --backend vllm \
   --dataset-name hf \
@@ -399,7 +399,7 @@ python3 benchmarks/benchmark_throughput.py \
 ``` bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model meta-llama/Llama-2-7b-hf \
   --backend vllm \
   --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index 8d3e1d4bee352..3cd8580e065dd 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. 
+# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
 # See details in README (benchmarks/auto_tune/README.md).
 
 TAG=$(date +"%Y_%m_%d_%H_%M")
@@ -56,7 +56,7 @@ start_server() {
     local max_num_batched_tokens=$3
     local vllm_log=$4
     local profile_dir=$5
-    
+
     pkill -f vllm
 
     VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
@@ -73,9 +73,9 @@ start_server() {
 
     # wait for 10 minutes...
     server_started=0
-    for i in {1..60}; do  
+    for i in {1..60}; do
         RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
-        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) 
+        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
         if [[ "$STATUS_CODE" -eq 200 ]]; then
             server_started=1
             break
@@ -98,10 +98,10 @@ update_best_profile() {
     selected_profile_file=
     if [[ "$SYSTEM" == "TPU" ]]; then
         selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
-    fi 
+    fi
     if [[ "$SYSTEM" == "GPU" ]]; then
         selected_profile_file="${sorted_paths[$profile_index]}"
-    fi 
+    fi
     rm -f $PROFILE_PATH/*
     cp $selected_profile_file $PROFILE_PATH
 }
@@ -129,14 +129,14 @@ run_benchmark() {
         echo "server started."
     fi
     echo
-    
+
     echo "run benchmark test..."
     meet_latency_requirement=0
     # get a basic qps by using request-rate inf
     bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
     prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
 adjusted_input_len=$(( INPUT_LEN - prefix_len ))
-    python3 benchmarks/benchmark_serving.py \
+    vllm bench serve \
         --backend vllm \
         --model $MODEL  \
         --dataset-name random \
@@ -169,7 +169,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
             curl -X POST http://0.0.0.0:8004/reset_prefix_cache
             sleep 5
             bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
-            python3 benchmarks/benchmark_serving.py \
+            vllm bench serve \
                 --backend vllm \
                 --model $MODEL  \
                 --dataset-name random \
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 4d2ea126b24a5..d8b960edaa468 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -11,6 +11,7 @@ from typing import Any, Optional
 
 import numpy as np
 from tqdm import tqdm
+from typing_extensions import deprecated
 
 import vllm.envs as envs
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format(
         write_to_json(pt_file, pt_records)
 
 
+@deprecated(
+    "benchmark_latency.py is deprecated and will be removed in a "
+    "future version. Please use 'vllm bench latency' instead.",
+)
 def main(args: argparse.Namespace):
     print(args)
 
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index c597fb1068aba..a97fa280f37c0 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -38,6 +38,7 @@ from typing import Any, Literal, Optional
 import numpy as np
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
+from typing_extensions import deprecated
 
 from backend_request_func import (
     ASYNC_REQUEST_FUNCS,
@@ -593,6 +594,10 @@ def save_to_pytorch_benchmark_format(
         write_to_json(pt_file, pt_records)
 
 
+@deprecated(
+    "benchmark_serving.py is deprecated and will be removed in a future "
+    "version. Please use 'vllm bench serve' instead.",
+)
 def main(args: argparse.Namespace):
     print(args)
     random.seed(args.seed)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index c0a7f1d582505..c51b579686529 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -15,6 +15,7 @@ import torch
 import uvloop
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
+from typing_extensions import deprecated
 
 from benchmark_dataset import (
     AIMODataset,
@@ -382,6 +383,10 @@ def get_requests(args, tokenizer):
     return dataset_cls(**common_kwargs).sample(**sample_kwargs)
 
 
+@deprecated(
+    "benchmark_throughput.py is deprecated and will be removed in a "
+    "future version. Please use 'vllm bench throughput' instead.",
+)
 def main(args: argparse.Namespace):
     if args.seed is None:
         args.seed = 0
diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
index a5851cfe963d2..aa3de617e072c 100644
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -38,7 +38,7 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \
 benchmark_serving.py:
 
 ```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
     --backend vllm \
     --model meta-llama/Meta-Llama-3-70B \
     --dataset-name sharegpt \
@@ -75,7 +75,7 @@ The following is an example using the `benchmarks/benchmark_latency.py` script:
 nsys profile -o report.nsys-rep \
     --trace-fork-before-exec=true \
     --cuda-graph-trace=node \
-    python benchmarks/benchmark_latency.py \
+vllm bench latency \
     --model meta-llama/Llama-3.1-8B-Instruct \
     --num-iters-warmup 5 \
     --num-iters 1 \
@@ -98,7 +98,7 @@ nsys profile -o report.nsys-rep \
     vllm serve meta-llama/Llama-3.1-8B-Instruct
 
 # client
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
     --backend vllm \
     --model meta-llama/Llama-3.1-8B-Instruct \
     --num-prompts 1 \
@@ -132,7 +132,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p
     ...
     ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):
 
-    Time (%)  Total Time (ns)  Instances   Avg (ns)     Med (ns)    Min (ns)  Max (ns)   StdDev (ns)                                                  Name                                                
+    Time (%)  Total Time (ns)  Instances   Avg (ns)     Med (ns)    Min (ns)  Max (ns)   StdDev (ns)                                                  Name
     --------  ---------------  ---------  -----------  -----------  --------  ---------  -----------  ----------------------------------------------------------------------------------------------------
         46.3   10,327,352,338     17,505    589,965.9    144,383.0    27,040  3,126,460    944,263.8  sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
         14.8    3,305,114,764      5,152    641,520.7    293,408.0   287,296  2,822,716    867,124.9  sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
@@ -143,7 +143,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p
         2.6      587,283,113     37,824     15,526.7      3,008.0     2,719  2,517,756    139,091.1  std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern…
         1.9      418,362,605     18,912     22,121.5      3,871.0     3,328  2,523,870    175,248.2  void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in…
         0.7      167,083,069     18,880      8,849.7      2,240.0     1,471  2,499,996    101,436.1  void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0…
-    ... 
+    ...
     ```
 
 GUI example:
diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/v1/p2p_nccl_connector.md
index 9f6acf3291dd2..9d334f8873d97 100644
--- a/docs/design/v1/p2p_nccl_connector.md
+++ b/docs/design/v1/p2p_nccl_connector.md
@@ -3,14 +3,14 @@ An implementation of xPyD with dynamic scaling based on point-to-point communica
 # Detailed Design
 
 ## Overall Process
-As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:  
+As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:
 
-1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.  
-2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either through round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**.  
-3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**.  
-4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`.  
-5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**.  
-6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**.  
+1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.
+2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either through round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**.
+3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**.
+4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`.
+5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**.
+6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**.
 7. After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**.
 
 ![image1](https://github.com/user-attachments/assets/fb01bde6-755b-49f7-ad45-48a94b1e10a7)
@@ -291,7 +291,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
 ??? console "Command"
 
     ```shell
-    python3 benchmark_serving.py \
+    vllm bench serve \
         --backend vllm \
         --model base_model \
         --tokenizer meta-llama/Llama-3.1-8B-Instruct \
diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
index 76f5c0c99d0bb..568f7a43b4962 100644
--- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
+++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
@@ -29,7 +29,7 @@ PROXY_PORT=${PROXY_PORT:-30001}
 PREFILL_GPUS=${PREFILL_GPUS:-0}
 DECODE_GPUS=${DECODE_GPUS:-1,2,3}
 PREFILL_PORTS=${PREFILL_PORTS:-20003}
-DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009} 
+DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009}
 
 echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change."
 echo ""
@@ -164,7 +164,7 @@ main() {
         local gpu_id=${PREFILL_GPU_ARRAY[$i]}
         local port=${PREFILL_PORT_ARRAY[$i]}
         local kv_port=$((21001 + i))
-        
+
         echo "  Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
         CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
         --enforce-eager \
@@ -193,7 +193,7 @@ main() {
         local gpu_id=${DECODE_GPU_ARRAY[$i]}
         local port=${DECODE_PORT_ARRAY[$i]}
         local kv_port=$((22001 + i))
-        
+
         echo "  Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
         VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
         --enforce-eager \
@@ -233,7 +233,7 @@ main() {
     # Run Benchmark
     # =============================================================================
     cd ../../../benchmarks/
-    python3 benchmark_serving.py --port 10001 --seed $(date +%s) \
+    vllm bench serve --port 10001 --seed $(date +%s) \
         --model $MODEL \
         --dataset-name random --random-input-len 7500 --random-output-len 200 \
         --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
@@ -243,4 +243,4 @@ main() {
     cleanup
 }
 
-main
\ No newline at end of file
+main
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
index 0b6c9213ebfff..1178681f1533b 100644
--- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
+++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
@@ -122,7 +122,7 @@ main() {
 
     # begin benchmark
     cd ../../../../benchmarks/
-    python3 benchmark_serving.py --port 9000 --seed $(date +%s) \
+    vllm bench serve --port 9000 --seed $(date +%s) \
         --model meta-llama/Llama-3.1-8B-Instruct \
         --dataset-name random --random-input-len 7500 --random-output-len 200 \
         --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log
@@ -133,4 +133,4 @@ main() {
 
 }
 
-main
\ No newline at end of file
+main

From de509ae8eb4467bd789234d909e5c725b8fba326 Mon Sep 17 00:00:00 2001
From: Kaixi Hou <kaixih@nvidia.com>
Date: Sat, 26 Jul 2025 07:10:36 -0700
Subject: [PATCH 40/57] [NVIDIA] Explicitly disable shuffled weights for
 flashinfer blockscale moe fp8 kernels (#21411)

Signed-off-by: kaixih <kaixih@nvidia.com>
---
 vllm/model_executor/layers/fused_moe/fused_moe.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index c412f695ae766..1985e8612da35 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1127,6 +1127,7 @@ def flashinfer_fused_moe_blockscale_fp8(
         tile_tokens_dim=_get_tile_tokens_dim(x.shape[0], top_k,
                                              global_num_experts),
         routing_method_type=2,  # DeepSeek-styled routing method
+        use_shuffled_weight=False,
     )
 
 

From 6c66f28fa5dc88ce6f7ab30dfa733f9ddb927d3c Mon Sep 17 00:00:00 2001
From: Wenchen Lo <charles761013@gmail.com>
Date: Sat, 26 Jul 2025 16:20:29 -0700
Subject: [PATCH 41/57] Remove xformers requirement for Mistral-format Pixtral
 and Mistral3 (#21154)

Signed-off-by: Wenchen Lo <charles761013@gmail.com>
---
 vllm/model_executor/models/pixtral.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 325a264a2f4c4..41eaf372785eb 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -671,7 +671,19 @@ class Attention(nn.Module):
         v = v.reshape(batch, patches, self.n_heads, self.head_dim)
 
         q, k = apply_rotary_emb_vit(q, k, freqs_cis=freqs_cis)
-        out = xops.memory_efficient_attention(q, k, v, attn_bias=mask)
+
+        if USE_XFORMERS_OPS:
+            out = xops.memory_efficient_attention(q, k, v, attn_bias=mask)
+        else:
+            q = q.transpose(1, 2)
+            k = k.transpose(1, 2)
+            v = v.transpose(1, 2)
+            out = nn.functional.scaled_dot_product_attention(q,
+                                                             k,
+                                                             v,
+                                                             attn_mask=mask)
+            out = out.transpose(1, 2)
+
         out = out.reshape(batch, patches, self.n_heads * self.head_dim)
         return self.wo(out)
 
@@ -814,8 +826,11 @@ class VisionTransformer(nn.Module):
             mask = xops.fmha.attn_bias.BlockDiagonalMask.from_seqlens(
                 [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], )
         else:
-            raise ImportError("Xformers is required for Pixtral inference "
-                              "with the Mistral format")
+            from transformers.models.pixtral.modeling_pixtral import (
+                generate_block_attention_mask)
+            mask = generate_block_attention_mask(
+                [p.shape[-2] * p.shape[-1] for p in patch_embeds_list],
+                patch_embeds)
         out = self.transformer(patch_embeds, mask=mask, freqs_cis=freqs_cis)
 
         # squeeze dim 0 and split into separate tensors for each image

From c6573698414c88990c3ddcd5386d26fb5911c59d Mon Sep 17 00:00:00 2001
From: Jinzhen Lin <linjinzhen@hotmail.com>
Date: Sun, 27 Jul 2025 07:54:32 +0800
Subject: [PATCH 42/57] support `torch.compile` for bailing moe (#21664)

---
 vllm/model_executor/models/bailing_moe.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py
index 853c13b135eac..23cab3509ca82 100644
--- a/vllm/model_executor/models/bailing_moe.py
+++ b/vllm/model_executor/models/bailing_moe.py
@@ -32,6 +32,7 @@ from torch import nn
 from transformers.configuration_utils import PretrainedConfig
 
 from vllm.attention import Attention
+from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
@@ -291,6 +292,7 @@ class BailingMoeBlock(nn.Module):
         return hidden_states, residual
 
 
+@support_torch_compile
 class BailingMoeModel(nn.Module):
 
     def __init__(

From ccf27cc4d4645a1382c7742bb6bddedb7dc50fea Mon Sep 17 00:00:00 2001
From: Benji Beck <benjibeck@meta.com>
Date: Sat, 26 Jul 2025 19:33:52 -0700
Subject: [PATCH 43/57] Migrate Blip2ImagePixelInputs and
 Blip2ImageEmbeddingInputs to TensorSchema (#21656)

Signed-off-by: Benji Beck <benjibeck@meta.com>
---
 vllm/model_executor/models/blip2.py | 78 ++++++++++++-----------------
 1 file changed, 33 insertions(+), 45 deletions(-)

diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 27a9208107871..8e3505f872eb2 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Literal, Optional, TypedDict, Union
+from typing import Annotated, Literal, Optional, Union
 
 import torch
 import torch.nn as nn
@@ -22,6 +22,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         PromptInsertion, PromptUpdate)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .blip import BlipVisionModel
 from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP,
@@ -34,19 +35,27 @@ from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
 _IMAGE_TOKEN_ID = 50265
 
 
-class Blip2ImagePixelInputs(TypedDict):
-    type: Literal["pixel_values"]
-    data: torch.Tensor
-    """Shape: `(batch_size * num_images, num_channels, height, width)`"""
-
-
-class Blip2ImageEmbeddingInputs(TypedDict):
-    type: Literal["image_embeds"]
-    data: torch.Tensor
-    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
-
-    `hidden_size` must match the hidden size of language model backbone.
+class Blip2ImagePixelInputs(TensorSchema):
     """
+    Dimensions:
+        - bn: Batch size * number of images
+        - c: Number of channels (3)
+        - h: Height of each image
+        - w: Width of each image
+    """
+    type: Literal["pixel_values"]
+    data: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")]
+
+
+class Blip2ImageEmbeddingInputs(TensorSchema):
+    """
+    Dimensions:
+        - bn: Batch size * number of images
+        - f: Image feature size
+        - h: Hidden size (must match the hidden size of language model backbone)
+    """
+    type: Literal["image_embeds"]
+    data: Annotated[torch.Tensor, TensorShape("bn", "f", "h")]
 
 
 Blip2ImageInputs = Union[Blip2ImagePixelInputs, Blip2ImageEmbeddingInputs]
@@ -551,21 +560,8 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
 
-    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
-        h = w = self.config.vision_config.image_size
-        expected_dims = (3, h, w)
-        actual_dims = tuple(data.shape[1:])
-
-        if actual_dims != expected_dims:
-            expected_expr = ("batch_size", *map(str, expected_dims))
-            raise ValueError(
-                f"The expected shape of pixel values is {expected_expr}. "
-                f"You supplied {tuple(data.shape)}.")
-
-        return data
-
-    def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[Blip2ImageInputs]:
+    def _create_image_input(self,
+                            **kwargs: object) -> Optional[Blip2ImageInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
@@ -573,27 +569,19 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
             return None
 
         if pixel_values is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
-            pixel_values = flatten_bn(pixel_values, concat=True)
-
-            return Blip2ImagePixelInputs(
-                type="pixel_values",
-                data=self._validate_pixel_values(pixel_values),
-            )
+            expected_h = expected_w = self.config.vision_config.image_size
+            return Blip2ImagePixelInputs(type="pixel_values",
+                                         data=flatten_bn(pixel_values,
+                                                         concat=True),
+                                         resolve_bindings={
+                                             "h": expected_h,
+                                             "w": expected_w
+                                         })
 
         if image_embeds is not None:
-            if not isinstance(image_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
-
-            image_embeds = flatten_bn(image_embeds, concat=True)
-
             return Blip2ImageEmbeddingInputs(
                 type="image_embeds",
-                data=image_embeds,
+                data=flatten_bn(image_embeds, concat=True),
             )
 
         raise AssertionError("This line should be unreachable.")

From 0b8caf909581a2fbd1b825c65478878c3ce3d0b9 Mon Sep 17 00:00:00 2001
From: Benji Beck <benjibeck@meta.com>
Date: Sat, 26 Jul 2025 19:34:11 -0700
Subject: [PATCH 44/57] Migrate DeepseekVL2ImageInputs to TensorSchema (#21658)

Signed-off-by: Benji Beck <benjibeck@meta.com>
---
 vllm/model_executor/models/deepseek_vl2.py | 105 +++++++--------------
 1 file changed, 32 insertions(+), 73 deletions(-)

diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 0ca6b28073ec8..544de5fe02d35 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -5,7 +5,7 @@
 """Inference-only Deepseek-VL2 model compatible with HuggingFace weights."""
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Literal, Optional, TypedDict, Union
+from typing import Annotated, Literal, Optional, Union
 
 import torch
 import torch.nn as nn
@@ -36,6 +36,7 @@ from vllm.transformers_utils.processors.deepseek_vl2 import (
     DeepseekVLV2Processor)
 from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
 from vllm.utils import is_list_of
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
@@ -46,25 +47,30 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
 _IMAGE_TOKEN = "<image>"
 
 
-class DeepseekVL2ImagePixelInputs(TypedDict):
+class DeepseekVL2ImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - bn: Batch size * number of images
+        - c: Number of channels (3)
+        - h: Height of each image
+        - w: Width of each image
+    """
     type: Literal["pixel_values"]
-    data: Union[torch.Tensor, list[torch.Tensor]]
-    """
-    Shape: `(batch_size * num_images, num_channels, height, width)`
-    """
-    images_spatial_crop: torch.Tensor
-    """
-    Shape: `(batch_size * num_images, 2)`
-    """
+    data: Annotated[Union[torch.Tensor, list[torch.Tensor]],
+                    TensorShape("bn", 3, "h", "w")]
+    images_spatial_crop: Annotated[torch.Tensor, TensorShape("bn", 2)]
 
 
-class DeepseekVL2VImageEmbeddingInputs(TypedDict):
+class DeepseekVL2VImageEmbeddingInputs(TensorSchema):
+    """
+    Dimensions:
+        - bn: Batch size * number of images
+        - f: Image feature size
+        - h: Hidden size (must match language model backbone)
+    """
     type: Literal["image_embeds"]
-    data: Union[torch.Tensor, list[torch.Tensor]]
-    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
-
-    `hidden_size` must match the hidden size of language model backbone.
-    """
+    data: Annotated[Union[torch.Tensor, list[torch.Tensor]],
+                    TensorShape("bn", "f", "h")]
 
 
 DeepseekVL2ImageInputs = Union[DeepseekVL2ImagePixelInputs,
@@ -439,46 +445,6 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         model = model.to(dtype=torch.get_default_dtype())
         return model
 
-    def _validate_pixel_values(
-        self, data: Union[torch.Tensor, list[torch.Tensor]]
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
-
-        h = w = self.vision_config.image_size
-        expected_dims = (3, h, w)
-
-        def _validate_shape(d: torch.Tensor):
-            actual_dims = tuple(d.shape[1:])
-
-            if actual_dims != expected_dims:
-                expected_expr = ("num_patches", *map(str, expected_dims))
-                raise ValueError(
-                    "The expected shape of pixel values per image per batch "
-                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
-
-        for d in data:
-            _validate_shape(d)
-
-        return data
-
-    def _validate_images_spatial_crop(
-        self, data: Union[torch.Tensor, list[torch.Tensor]]
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
-        expected_dims = 2
-
-        def _validate_shape(d: torch.Tensor):
-            actual_dims = d.size(-1)
-
-            if actual_dims != expected_dims:
-                expected_expr = str(expected_dims)
-                raise ValueError(
-                    f"The expected shape of image sizes per image per batch "
-                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
-
-        for d in data:
-            _validate_shape(d)
-
-        return data
-
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[DeepseekVL2ImageInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
@@ -489,25 +455,18 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
             return None
 
         if pixel_values is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
-            if not isinstance(images_spatial_crop, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image sizes. "
-                                 f"Got type: {type(images_spatial_crop)}")
-
-            return DeepseekVL2ImagePixelInputs(
-                type="pixel_values",
-                data=self._validate_pixel_values(flatten_bn(pixel_values)),
-                images_spatial_crop=self._validate_images_spatial_crop(
-                    flatten_bn(images_spatial_crop, concat=True)))
+            expected_h = expected_w = self.vision_config.image_size
+            return DeepseekVL2ImagePixelInputs(type="pixel_values",
+                                               data=flatten_bn(pixel_values),
+                                               images_spatial_crop=flatten_bn(
+                                                   images_spatial_crop,
+                                                   concat=True),
+                                               resolve_bindings={
+                                                   "h": expected_h,
+                                                   "w": expected_w,
+                                               })
 
         if image_embeds is not None:
-            if not isinstance(image_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
-
             return DeepseekVL2VImageEmbeddingInputs(
                 type="image_embeds",
                 data=flatten_bn(image_embeds),

From 3339cba3ff4de0d4a516b31076dc73673510c227 Mon Sep 17 00:00:00 2001
From: Benji Beck <benjibeck@meta.com>
Date: Sat, 26 Jul 2025 19:34:14 -0700
Subject: [PATCH 45/57] Migrate FuyuImagePatchInputs to TensorSchema (#21662)

Signed-off-by: Benji Beck <benjibeck@meta.com>
---
 tests/standalone_tests/test_tensor_schema.py | 22 ++++++++
 vllm/model_executor/models/fuyu.py           | 55 +++++++-------------
 vllm/utils/tensor_schema.py                  | 23 +++++---
 3 files changed, 56 insertions(+), 44 deletions(-)

diff --git a/tests/standalone_tests/test_tensor_schema.py b/tests/standalone_tests/test_tensor_schema.py
index c5b77bb09bbba..b276b88fac1f9 100644
--- a/tests/standalone_tests/test_tensor_schema.py
+++ b/tests/standalone_tests/test_tensor_schema.py
@@ -4,6 +4,7 @@
 import pytest
 import torch
 
+from vllm.model_executor.models.fuyu import FuyuImagePatchInputs
 from vllm.model_executor.models.phi3v import Phi3VImagePixelInputs
 
 
@@ -124,3 +125,24 @@ def test_tensor_schema_with_invalid_resolve_binding_dims():
                 "w": 336
             },
         )
+
+
+def test_tensor_schema_with_list_of_symbolic_dim():
+    flat_data = torch.stack([torch.randn(768) for _ in range(3)])  # (bn=3, fn)
+    patches_per_image = [64, 64, 64]  # len = bn = 3
+
+    FuyuImagePatchInputs(
+        flat_data=flat_data,
+        patches_per_image=patches_per_image,
+    )
+
+
+def test_tensor_schema_with_list_of_symbolic_dim_mismatch_in_length():
+    flat_data = torch.stack([torch.randn(768) for _ in range(4)])  # (bn=4, fn)
+    patches_per_image = [64, 64, 64]  # len = 3 ≠ bn
+
+    with pytest.raises(ValueError, match="expected 'bn'=4, got 3"):
+        FuyuImagePatchInputs(
+            flat_data=flat_data,
+            patches_per_image=patches_per_image,
+        )
\ No newline at end of file
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 558d4fbb4de11..4fb571122abbf 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -19,7 +19,7 @@
 """ PyTorch Fuyu model."""
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Literal, Optional, TypedDict
+from typing import Annotated, Literal, Optional
 
 import torch
 import torch.nn as nn
@@ -40,6 +40,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         PromptUpdate, PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix,
@@ -50,18 +51,24 @@ _IMAGE_TOKEN_ID = 71011
 _NEWLINE_TOKEN_ID = 71019
 
 
-class FuyuImagePatchInputs(TypedDict):
-    type: Literal["image_patches"]
-    flat_data: torch.Tensor
+class FuyuImagePatchInputs(TensorSchema):
     """
-    Shape: 
-    `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)`
+    Dimensions:
+        - bn: Batch size * number of images
+        - fn: Num channels * patch_size_x * patch_size_y
     """
 
-    patches_per_image: list[int]
+    type: Literal["image_patches"] = "image_patches"
+
+    flat_data: Annotated[
+        torch.Tensor,
+        TensorShape("bn", "fn"),
+    ]
+
+    patches_per_image: Annotated[list[int], TensorShape("bn")]
     """
     The number of total patches for each image in the batch.
-
+    
     This is used to split the embeddings which has the first two dimensions
     flattened just like `flat_data`.
     """
@@ -297,42 +304,18 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
 
-    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
-
-        h = w = self.config.patch_size
-        num_channels = self.config.num_channels
-        expected_dims = num_channels * h * w
-
-        def _validate_shape(d: torch.Tensor):
-            actual_dims = d.size(-1)
-
-            if actual_dims != expected_dims:
-                expected_expr = str(expected_dims)
-                raise ValueError(
-                    "The expected shape of pixel values per image per batch "
-                    f"per patch is {expected_expr}. "
-                    f"You supplied {tuple(d.shape)}.")
-
-        for d in data:
-            _validate_shape(d)
-
-        return data.to(self.vision_embed_tokens.weight.dtype)
-
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[FuyuImagePatchInputs]:
         image_patches = kwargs.pop("image_patches", None)
         if image_patches is not None:
-            if not isinstance(image_patches, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image patches. "
-                                 f"Got type: {type(image_patches)}")
-
             image_patches_flat = flatten_bn(image_patches)
-
+            flat_data = flatten_bn(image_patches, concat=True).data.to(
+                self.vision_embed_tokens.weight.dtype)
             return FuyuImagePatchInputs(
                 type="image_patches",
-                flat_data=self._validate_pixel_values(
-                    flatten_bn(image_patches_flat, concat=True)),
+                flat_data=flat_data,
                 patches_per_image=[x.size(0) for x in image_patches_flat],
+                resolve_bindings={"fn": self.image_feature_size},
             )
 
         return None
diff --git a/vllm/utils/tensor_schema.py b/vllm/utils/tensor_schema.py
index 485a0a72ddca8..343df71e1058d 100644
--- a/vllm/utils/tensor_schema.py
+++ b/vllm/utils/tensor_schema.py
@@ -86,9 +86,6 @@ class TensorSchema:
             expected_shape: tuple[Union[int, str], ...],
             dynamic_dims: set[str, ...]) -> tuple[int, ...]:
         """Validate a list/tuple of tensors and return the actual shape."""
-        if not value:
-            raise ValueError(f"{field_name} is an empty list")
-
         # Ensure all tensors in the list have the same
         # shape, besides dynamic dimensions
         first = value[0]
@@ -117,6 +114,7 @@ class TensorSchema:
                                                                          int],
                                         dynamic_dims: set[str, ...]) -> None:
         """Validate that the actual tensor shape matches the expected shape."""
+
         if len(actual_shape) != len(expected_shape):
             raise ValueError(f"{field_name} has rank {len(actual_shape)} "
                              f"but expected {len(expected_shape)}")
@@ -160,12 +158,11 @@ class TensorSchema:
                     # Skip validation when Union contains None
                     if type(None) in args:
                         continue
-                # If not optional, raise error
+                # Otherwise field is required, raise error
                 raise ValueError(f"Required field '{field_name}' is missing")
 
             # Field exists, proceed with validation
             value = getattr(self, field_name)
-
             if get_origin(field_type) is not None:
                 args = get_args(field_type)
 
@@ -173,13 +170,23 @@ class TensorSchema:
                     if isinstance(arg, TensorShape):
                         expected_shape = arg.resolve(**self._resolve_bindings)
                         if isinstance(value, (list, tuple)):
-                            actual_shape = self._validate_nested_tensors(
-                                value, field_name, expected_shape,
-                                arg.dynamic_dims)
+                            # list/tuple of Tensors → shape = (len(value), ...)
+                            if value and isinstance(value[0], torch.Tensor):
+                                actual_shape = self._validate_nested_tensors(
+                                    value, field_name, expected_shape,
+                                    arg.dynamic_dims)
+                            elif value:
+                                # list/tuple of scalars → shape = (len(value),)
+                                actual_shape = (len(value), )
+                            else:
+                                raise ValueError(
+                                    f"{field_name} is an empty list")
 
+                        # Tensor → shape = tensor.shape
                         elif isinstance(value, torch.Tensor):
                             actual_shape = value.shape
 
+                        # Otherwise, it's an unsupported type
                         else:
                             type_names = []
                             for arg in args:

From 20950b29fb33465bcc822bb47a193ff0c0ba01bf Mon Sep 17 00:00:00 2001
From: Benji Beck <benjibeck@meta.com>
Date: Sat, 26 Jul 2025 19:34:25 -0700
Subject: [PATCH 46/57] Migrate ChameleonImagePixelInputs to TensorSchema
 (#21657)

Signed-off-by: Benji Beck <benjibeck@meta.com>
---
 vllm/model_executor/models/chameleon.py | 46 +++++++++++--------------
 1 file changed, 20 insertions(+), 26 deletions(-)

diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 74b18df7214b4..8d705f40ce8ff 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -3,7 +3,7 @@
 
 from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
-from typing import Any, Literal, Optional, TypedDict, Union
+from typing import Annotated, Any, Literal, Optional, Union
 
 import torch
 import torch.nn as nn
@@ -38,6 +38,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         PromptUpdate, PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP,
                          SupportsQuant)
@@ -48,10 +49,16 @@ from .utils import (flatten_bn, is_pp_missing_parameter,
 logger = init_logger(__name__)
 
 
-class ChameleonImagePixelInputs(TypedDict):
+class ChameleonImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - bn: Batch size * number of images
+        - c: Number of channels (3)
+        - h: Height of each image
+        - w: Width of each image
+    """
     type: Literal["pixel_values"]
-    data: torch.Tensor
-    """Shape: `(batch_size * num_images, num_channels, height, width)`"""
+    data: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")]
 
 
 class ChameleonProcessingInfo(BaseProcessingInfo):
@@ -962,19 +969,6 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
-    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
-        vq_config: ChameleonVQVAEConfig = self.config.vq_config
-        expected_dims = (3, vq_config.resolution, vq_config.resolution)
-        actual_dims = tuple(data.shape[1:])
-
-        if actual_dims != expected_dims:
-            expected_expr = ("batch_size", *map(str, expected_dims))
-            raise ValueError(
-                f"The expected shape of pixel values is {expected_expr}. "
-                f"You supplied {tuple(data.shape)}.")
-
-        return data
-
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[ChameleonImagePixelInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
@@ -982,16 +976,16 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
         if pixel_values is None:
             return None
 
-        if not isinstance(pixel_values, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of pixel values. "
-                             f"Got type: {type(pixel_values)}")
+        vq_config: ChameleonVQVAEConfig = self.config.vq_config
+        expected_h = expected_w = vq_config.resolution
 
-        pixel_values = flatten_bn(pixel_values, concat=True)
-
-        return ChameleonImagePixelInputs(
-            type="pixel_values",
-            data=self._validate_pixel_values(pixel_values),
-        )
+        return ChameleonImagePixelInputs(type="pixel_values",
+                                         data=flatten_bn(pixel_values,
+                                                         concat=True),
+                                         resolve_bindings={
+                                             "h": expected_h,
+                                             "w": expected_w
+                                         })
 
     def get_language_model(self) -> torch.nn.Module:
         return self.model

From eed2f463b26c0ca65c12be62505a04de99ead456 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 27 Jul 2025 11:07:57 +0800
Subject: [PATCH 47/57] [VLM] Support HF format Phi-4-MM model (#17121)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 docs/models/supported_models.md               |    1 +
 examples/offline_inference/audio_language.py  |   32 +
 examples/offline_inference/vision_language.py |   36 +
 .../vision_language_multi_image.py            |   35 +
 .../generation/test_phi4_multimodal.py        |  252 +++
 .../multimodal/processing/test_common.py      |   34 +-
 tests/models/registry.py                      |    2 +
 vllm/model_executor/models/phi4_multimodal.py | 1455 +++++++++++++++++
 vllm/model_executor/models/registry.py        |    3 +-
 vllm/transformers_utils/tokenizer.py          |    2 +-
 10 files changed, 1847 insertions(+), 5 deletions(-)
 create mode 100644 tests/models/multimodal/generation/test_phi4_multimodal.py
 create mode 100644 vllm/model_executor/models/phi4_multimodal.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 3847fc15119fd..989bd5fff9655 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -614,6 +614,7 @@ Specified using `--task generate`.
 | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
 | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
 | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | ✅︎ |
 | `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ |
 | `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index 8014cb53f16a8..01d6a188be994 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -190,6 +190,37 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
     )
 
 
+def run_phi4_multimodal(question: str, audio_count: int) -> ModelRequestData:
+    """
+    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
+    show how to process audio inputs.
+    """
+    model_path = snapshot_download(
+        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
+    )
+    # Since the vision-lora and speech-lora co-exist with the base model,
+    # we have to manually specify the path of the lora weights.
+    speech_lora_path = os.path.join(model_path, "speech-lora")
+    placeholders = "<|audio|>" * audio_count
+
+    prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
+
+    engine_args = EngineArgs(
+        model=model_path,
+        max_model_len=12800,
+        max_num_seqs=2,
+        enable_lora=True,
+        max_lora_rank=320,
+        limit_mm_per_prompt={"audio": audio_count},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompts,
+        lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
+    )
+
+
 # Qwen2-Audio
 def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
     model_name = "Qwen/Qwen2-Audio-7B-Instruct"
@@ -303,6 +334,7 @@ model_example_map = {
     "granite_speech": run_granite_speech,
     "minicpmo": run_minicpmo,
     "phi4_mm": run_phi4mm,
+    "phi4_multimodal": run_phi4_multimodal,
     "qwen2_audio": run_qwen2_audio,
     "qwen2_5_omni": run_qwen2_5_omni,
     "ultravox": run_ultravox,
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 61f5525c6d7e7..8af59110230e1 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -1097,6 +1097,41 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# HF format Phi-4-multimodal-instruct
+def run_phi4_multimodal(questions: list[str], modality: str) -> ModelRequestData:
+    """
+    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
+    show how to process image inputs.
+    """
+    assert modality == "image"
+    model_path = snapshot_download(
+        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
+    )
+    # Since the vision-lora and speech-lora co-exist with the base model,
+    # we have to manually specify the path of the lora weights.
+    vision_lora_path = os.path.join(model_path, "vision-lora")
+    prompts = [
+        f"<|user|><|image|>{question}<|end|><|assistant|>" for question in questions
+    ]
+    engine_args = EngineArgs(
+        model=model_path,
+        max_model_len=5120,
+        max_num_seqs=2,
+        max_num_batched_tokens=12800,
+        enable_lora=True,
+        max_lora_rank=320,
+        # Note - mm_processor_kwargs can also be passed to generate/chat calls
+        mm_processor_kwargs={"dynamic_hd": 16},
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
+    )
+
+
 # Pixtral HF-format
 def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -1356,6 +1391,7 @@ model_example_map = {
     "paligemma2": run_paligemma2,
     "phi3_v": run_phi3v,
     "phi4_mm": run_phi4mm,
+    "phi4_multimodal": run_phi4_multimodal,
     "pixtral_hf": run_pixtral_hf,
     "qwen_vl": run_qwen_vl,
     "qwen2_vl": run_qwen2_vl,
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index e312a0953e9be..dd50f3639709e 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -760,6 +760,40 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_phi4_multimodal(question: str, image_urls: list[str]) -> ModelRequestData:
+    """
+    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
+    show how to process multi images inputs.
+    """
+
+    model_path = snapshot_download(
+        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
+    )
+    # Since the vision-lora and speech-lora co-exist with the base model,
+    # we have to manually specify the path of the lora weights.
+    vision_lora_path = os.path.join(model_path, "vision-lora")
+    engine_args = EngineArgs(
+        model=model_path,
+        max_model_len=4096,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        enable_lora=True,
+        max_lora_rank=320,
+        # Note - mm_processor_kwargs can also be passed to generate/chat calls
+        mm_processor_kwargs={"dynamic_hd": 4},
+    )
+
+    placeholders = "<|image|>" * len(image_urls)
+    prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
+    )
+
+
 def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "Qwen/Qwen-VL-Chat"
     engine_args = EngineArgs(
@@ -988,6 +1022,7 @@ model_example_map = {
     "ovis": load_ovis,
     "phi3_v": load_phi3v,
     "phi4_mm": load_phi4mm,
+    "phi4_multimodal": load_phi4_multimodal,
     "pixtral_hf": load_pixtral_hf,
     "qwen_vl_chat": load_qwen_vl_chat,
     "qwen2_vl": load_qwen2_vl,
diff --git a/tests/models/multimodal/generation/test_phi4_multimodal.py b/tests/models/multimodal/generation/test_phi4_multimodal.py
new file mode 100644
index 0000000000000..db8984d8656fc
--- /dev/null
+++ b/tests/models/multimodal/generation/test_phi4_multimodal.py
@@ -0,0 +1,252 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+from collections.abc import Sequence
+from typing import Optional
+
+import librosa
+import pytest
+from huggingface_hub import snapshot_download
+
+from vllm.assets.image import ImageAsset
+from vllm.lora.request import LoRARequest
+from vllm.multimodal.image import rescale_image_size
+from vllm.platforms import current_platform
+
+from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
+                          PromptImageInput, VllmRunner)
+from ....utils import large_gpu_test
+from ...utils import check_logprobs_close
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    "stop_sign":
+    "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
+    "cherry_blossom":
+    "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
+})
+HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
+
+model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct",
+                               revision="refs/pr/70")
+# Since the vision-lora and speech-lora co-exist with the base model,
+# we have to manually specify the path of the lora weights.
+vision_lora_path = os.path.join(model_path, "vision-lora")
+speech_question = os.path.join(model_path, "examples",
+                               "what_is_shown_in_this_image.wav")
+models = [model_path]
+
+target_dtype = "half"
+
+# ROCm Triton FA can run into shared memory issues with these models,
+# use other backends in the meantime
+# FIXME (mattwong, gshtrasb, hongxiayan)
+if current_platform.is_rocm():
+    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
+
+
+def run_test(
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    inputs: Sequence[tuple[list[str], PromptImageInput,
+                           Optional[PromptAudioInput]]],
+    model: str,
+    *,
+    max_model_len: int,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    mm_limit: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    """Inference result should be the same between hf and vllm.
+
+    All the image fixtures for the test are from IMAGE_ASSETS.
+    For huggingface runner, we provide the PIL images as input.
+    For vllm runner, we provide MultiModalDataDict objects
+    and corresponding MultiModalConfig as input.
+    Note, the text input is also adjusted to abide by vllm contract.
+    The text output is sanitized to be able to compare with hf.
+    """
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+    # max_model_len should be greater than image_feature_size
+    with vllm_runner(
+            model,
+            task="generate",
+            max_model_len=max_model_len,
+            max_num_seqs=2,
+            dtype=dtype,
+            limit_mm_per_prompt={"image": mm_limit},
+            tensor_parallel_size=tensor_parallel_size,
+            distributed_executor_backend=distributed_executor_backend,
+            enable_lora=True,
+            max_lora_rank=320,
+            gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
+            enforce_eager=True,
+            trust_remote_code=False,
+    ) as vllm_model:
+        lora_request = LoRARequest("vision", 1, vision_lora_path)
+        vllm_outputs_per_case = [
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                images=images,
+                                                audios=audios,
+                                                lora_request=lora_request)
+            for prompts, images, audios in inputs
+        ]
+
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_model.model.load_adapter(
+            vision_lora_path,
+            adapter_name="vision",
+        )
+        hf_processor = hf_model.processor
+        eos_token_id = hf_processor.tokenizer.eos_token_id
+        hf_outputs_per_case = [
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
+                                                    num_logprobs=num_logprobs,
+                                                    images=images,
+                                                    audios=audios,
+                                                    eos_token_id=eos_token_id)
+            for prompts, images, audios in inputs
+        ]
+
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
+                                        vllm_outputs_per_case):
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_model_len", [12800])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [10])
+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
+                dtype: str, max_model_len: int, max_tokens: int,
+                num_logprobs: int) -> None:
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+        None,
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+
+    run_test(
+        hf_runner,
+        vllm_runner,
+        inputs_per_image,
+        model,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        mm_limit=1,
+        tensor_parallel_size=1,
+    )
+
+
+@large_gpu_test(min_gb=48)
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        # [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_model_len", [25600])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [10])
+def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
+                             size_factors, dtype: str, max_model_len: int,
+                             max_tokens: int, num_logprobs: int) -> None:
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_case = [
+        (
+            [HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
+            [[rescale_image_size(image, factor) for image in images]
+             for factor in size_factors],
+            None,
+        ),
+    ]
+
+    run_test(
+        hf_runner,
+        vllm_runner,
+        inputs_per_case,
+        model,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        mm_limit=2,
+        tensor_parallel_size=1,
+    )
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_model_len", [12800])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [10])
+def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
+                              max_model_len: int, max_tokens: int,
+                              num_logprobs: int) -> None:
+
+    # use the example speech question so that the model outputs are reasonable
+    audio = librosa.load(speech_question, sr=16000)
+    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+
+    inputs_vision_speech = [
+        (
+            ["<|user|><|image|><|audio|><|end|><|assistant|>"],
+            [image],
+            [audio],
+        ),
+    ]
+
+    run_test(
+        hf_runner,
+        vllm_runner,
+        inputs_vision_speech,
+        model,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        mm_limit=1,
+        tensor_parallel_size=1,
+    )
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index c2e9a73fa82f0..03bdafdbb9311 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -41,12 +41,18 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
 
 
 def _test_processing_correctness(
-    model_id: str,
+    model_id_or_arch: str,
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
 ):
-    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
+    if model_id_or_arch in HF_EXAMPLE_MODELS.get_supported_archs():
+        # Use model architecture to get the default model id
+        model_info = HF_EXAMPLE_MODELS.get_hf_info(model_id_or_arch)
+        model_id = model_info.default
+    else:
+        model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
+        model_id = model_id_or_arch
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")
 
@@ -58,7 +64,7 @@ def _test_processing_correctness(
         trust_remote_code=model_info.trust_remote_code,
         seed=0,
         dtype="auto",
-        revision=None,
+        revision=model_info.revision,
         hf_overrides=model_info.hf_overrides,
     )
 
@@ -331,6 +337,28 @@ def test_processing_correctness(
     )
 
 
+# Phi4MultimodalForCausalLM share same model repo with original format
+# Phi4MMForCausalLM, so we add it as a separate test case
+# Remove this test after conversion PR merged:
+# https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/70
+@pytest.mark.parametrize("model_arch", ["Phi4MultimodalForCausalLM"])
+@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
+@pytest.mark.parametrize("num_batches", [32])
+@pytest.mark.parametrize("simplify_rate", [1.0])
+def test_processing_correctness_phi4_multimodal(
+    model_arch: str,
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
+):
+    _test_processing_correctness(
+        model_arch,
+        hit_rate=hit_rate,
+        num_batches=num_batches,
+        simplify_rate=simplify_rate,
+    )
+
+
 def _assert_inputs_equal(
     a: MultiModalInputs,
     b: MultiModalInputs,
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 0dc5aec8db12e..0ef2a028b4a1e 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -433,6 +433,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                     "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}),  # noqa: E501
     "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
                                         trust_remote_code=True),
+    "Phi4MultimodalForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",  # noqa: E501
+                                                 revision="refs/pr/70"),
     "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409",  # noqa: E501
                                                        tokenizer_mode="mistral"),
     "QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL",
diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py
new file mode 100644
index 0000000000000..432b707a61591
--- /dev/null
+++ b/vllm/model_executor/models/phi4_multimodal.py
@@ -0,0 +1,1455 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import math
+from collections.abc import Iterable, Mapping, Sequence
+from typing import Any, Literal, Optional, TypedDict, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import (BatchFeature, Phi4MultimodalAudioConfig,
+                          Phi4MultimodalConfig, Phi4MultimodalFeatureExtractor,
+                          Phi4MultimodalImageProcessorFast)
+from transformers import Phi4MultimodalProcessor as Phi4MMProcessor
+from transformers.models.phi4_multimodal.modeling_phi4_multimodal import (
+    Phi4MultimodalAudioConvModule, Phi4MultimodalAudioNemoConvSubsampling,
+    Phi4MultimodalAudioRelativeAttentionBias, adaptive_enc_mask, unfold_tensor)
+
+from vllm.config import VllmConfig
+from vllm.distributed import (divide, get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size)
+from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs, NestedTensors)
+from vllm.multimodal.parse import (AudioProcessorItems, ImageEmbeddingItems,
+                                   ImageProcessorItems, ImageSize,
+                                   MultiModalDataItems, MultiModalDataParser)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptUpdate)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+from vllm.utils import is_list_of
+
+from .idefics2_vision_model import Idefics2VisionTransformer
+from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal
+from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
+                    init_vllm_registered_model, maybe_prefix,
+                    merge_multimodal_embeddings)
+
+# <|endoftext10|> (see vocab.json in hf model)
+_IMAGE_PLACEHOLDER_TOKEN_ID = 200010
+# <|endoftext11|>
+_AUDIO_PLACEHOLDER_TOKEN_ID = 200011
+
+_AUDIO_MAX_SOUNDFILE_SIZE = 241_000
+
+
+def _get_padding_size(orig_width: int, orig_height: int, target_height: int,
+                      target_width: int):
+    ratio_width = target_width / orig_width
+    ratio_height = target_height / orig_height
+
+    if ratio_width < ratio_height:
+        padding_width = 0
+        padding_height = target_height - int(orig_height * ratio_width)
+    else:
+        padding_width = target_width - int(orig_width * ratio_height)
+        padding_height = 0
+    return padding_height, padding_width
+
+
+class Phi4MMProjector(nn.Module):
+
+    def __init__(self, input_size: int, hidden_size: int):
+        super().__init__()
+        self.up = ColumnParallelLinear(input_size, hidden_size)
+        self.down = RowParallelLinear(hidden_size, hidden_size)
+        self.act = get_act_fn("gelu")
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, _ = self.up(x)
+        x = self.act(x)
+        x, _ = self.down(x)
+        return x
+
+
+class Phi4MMImageEmbedding(nn.Module):
+    """Image embedding."""
+
+    def __init__(self, config: Phi4MultimodalConfig):
+        super().__init__()
+        self.config = config
+        self.layer_idx = config.vision_config.feature_layer
+        self.crop_size = config.vision_config.crop_size
+        self.image_dim_out = config.vision_config.hidden_size
+
+        n_patches = (config.vision_config.image_size //
+                     config.vision_config.patch_size)
+        if n_patches % 2 != 0:
+            self.img_processor_padding = nn.ReflectionPad2d((0, 1, 0, 1))
+            n_patches += 1
+        self.num_img_tokens = (n_patches // 2)**2
+
+        num_hidden_layers = (config.vision_config.num_hidden_layers +
+                             self.layer_idx +
+                             1 if self.layer_idx < 0 else self.layer_idx + 1)
+        self.img_processor = Idefics2VisionTransformer(
+            config.vision_config,
+            require_post_norm=False,
+            num_hidden_layers_override=num_hidden_layers)
+        self.image_token_compression = nn.AvgPool2d(kernel_size=2, stride=2)
+        self.img_projection = Phi4MMProjector(self.image_dim_out,
+                                              config.hidden_size)
+        self.global_img_feature_extensor = nn.Parameter(
+            torch.zeros([1, 1, self.image_dim_out]))
+        self.sub_img_feature_extensor = nn.Parameter(
+            torch.zeros([1, 1, 1, self.image_dim_out]))
+
+    def get_img_features(
+        self,
+        img_embeds: torch.FloatTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.FloatTensor:
+        img_feature = self.img_processor(img_embeds,
+                                         patch_attention_mask=attention_mask)
+
+        patch_feature = img_feature
+        # reshape to 2D tensor
+        width = int(math.sqrt(patch_feature.size(1)))
+        patch_feature = patch_feature.view(-1, width, width,
+                                           patch_feature.size(-1))
+        # convert to NCHW
+        patch_feature = patch_feature.permute(0, 3, 1, 2)
+        if getattr(self, "img_processor_padding", None) is not None:
+            patch_feature = self.img_processor_padding(patch_feature)
+        patch_feature = self.image_token_compression(patch_feature)
+        # convert to NHWC
+        patch_feature = patch_feature.permute(0, 2, 3, 1)
+        patch_feature = patch_feature.view(
+            -1,
+            patch_feature.size(1) * patch_feature.size(2),
+            patch_feature.size(-1))
+        return patch_feature
+
+    def forward(
+        self,
+        image_pixel_values: torch.FloatTensor,
+        image_sizes: Optional[torch.Tensor] = None,
+        image_attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.FloatTensor:
+        image_pixel_values = image_pixel_values.to(
+            self.img_processor.embeddings.patch_embedding.weight.dtype)
+
+        target_device = self.img_projection.up.bias.device
+        target_dtype = self.img_projection.up.bias.dtype
+
+        batch_size = image_pixel_values.shape[0]
+
+        img_features = self.get_img_features(
+            image_pixel_values.flatten(0, 1),
+            attention_mask=image_attention_mask.flatten(0, 1).to(
+                dtype=bool, device=target_device),
+        )
+        base_feat_size = int(np.sqrt(img_features.shape[1]))
+        img_features = img_features.view(batch_size, -1, base_feat_size**2,
+                                         self.image_dim_out)
+        image_sizes = image_sizes.view(-1, 2)
+
+        output_imgs = []
+        for idx in range(batch_size):
+            height, width = image_sizes[idx]
+            height_ratio = height // self.crop_size
+            width_ratio = width // self.crop_size
+            area_ratio = height_ratio * width_ratio
+
+            global_img = img_features[idx, :1]
+            global_img = global_img.reshape(1, base_feat_size, base_feat_size,
+                                            self.image_dim_out).contiguous()
+            temporary_extensor = self.sub_img_feature_extensor.repeat(
+                1, base_feat_size, 1, 1)
+            global_img = torch.cat([global_img, temporary_extensor],
+                                   dim=2).reshape(1, -1, self.image_dim_out)
+
+            sub_img = img_features[idx, 1:]
+            sub_img = sub_img[:area_ratio]
+            sub_img = (sub_img.reshape(
+                height_ratio, width_ratio, base_feat_size, base_feat_size,
+                self.image_dim_out).transpose(1, 2).reshape(
+                    1, height_ratio * base_feat_size,
+                    width_ratio * base_feat_size,
+                    self.image_dim_out).contiguous())
+
+            if image_attention_mask is not None:
+                reshaped_image_attention_mask = (
+                    image_attention_mask[idx, 1:area_ratio + 1,
+                                         0::2, 0::2].reshape(
+                                             height_ratio, width_ratio,
+                                             base_feat_size,
+                                             base_feat_size).transpose(
+                                                 1, 2).reshape(
+                                                     1, height_ratio *
+                                                     base_feat_size,
+                                                     width_ratio *
+                                                     base_feat_size))
+                useful_height = int(
+                    reshaped_image_attention_mask[0, :, 0].sum().item())
+                useful_width = int(
+                    reshaped_image_attention_mask[0, 0, :].sum().item())
+                sub_img = sub_img[:, :useful_height, :useful_width]
+                temporary_extensor = self.sub_img_feature_extensor.repeat(
+                    1, useful_height, 1, 1)
+            else:
+                temporary_extensor = self.sub_img_feature_extensor.repeat(
+                    1, height_ratio * base_feat_size, 1, 1)
+
+            sub_img = torch.cat([sub_img, temporary_extensor],
+                                dim=2).reshape(1, -1, self.image_dim_out)
+
+            # Merge global and sub
+            output_imgs.append(
+                torch.cat(
+                    [sub_img, self.global_img_feature_extensor, global_img],
+                    dim=1))
+
+        img_set_tensor = []
+        for output_img in output_imgs:
+            output_img = output_img.to(device=target_device,
+                                       dtype=target_dtype)
+            img_feature_proj = self.img_projection(output_img)
+            img_set_tensor.append(img_feature_proj.flatten(0, 1))
+
+        return img_set_tensor
+
+
+class Phi4MultimodalAudioMLP(nn.Module):
+
+    def __init__(
+        self,
+        config: Phi4MultimodalAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+        self.act_fn = MulAndSilu()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            config.hidden_size, [config.intermediate_size] * 2,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj")
+        self.down_proj = RowParallelLinear(config.intermediate_size,
+                                           config.hidden_size,
+                                           bias=True,
+                                           quant_config=quant_config,
+                                           prefix=f"{prefix}.down_proj")
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states, _ = self.gate_up_proj(hidden_states)
+        hidden_states = self.act_fn(hidden_states)
+        hidden_states, _ = self.down_proj(hidden_states)
+        return hidden_states
+
+
+class Phi4MultimodalAudioAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: Phi4MultimodalAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.total_num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.total_num_heads
+        if self.head_dim * self.total_num_heads != self.embed_dim:
+            raise ValueError(
+                "embed_dim must be divisible by num_heads "
+                f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads}).")
+        self.scale = self.head_dim**-0.5
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=self.embed_dim,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.o_proj = RowParallelLinear(
+            input_size=self.embed_dim,
+            output_size=self.embed_dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+        )
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.num_heads = divide(self.total_num_heads, self.tp_size)
+
+    def split_attn_mask(self, attention_mask: torch.Tensor) -> torch.Tensor:
+        start_idx = self.num_heads * self.tp_rank
+        end_idx = self.num_heads * (self.tp_rank + 1)
+        return attention_mask[:, start_idx:end_idx]
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv_states, _ = self.qkv_proj(hidden_states)
+        query, key, value = qkv_states.chunk(3, dim=-1)
+
+        bsz, seq_len, _ = query.size()
+        query = query.view(bsz, seq_len, self.num_heads, self.head_dim)
+        key = key.view(bsz, seq_len, self.num_heads, self.head_dim)
+        value = value.view(bsz, seq_len, self.num_heads, self.head_dim)
+        query, key, value = (x.transpose(1, 2) for x in (query, key, value))
+
+        attention_mask = self.split_attn_mask(attention_mask)
+        out = F.scaled_dot_product_attention(
+            query,
+            key,
+            value,
+            scale=self.scale,
+            attn_mask=attention_mask,
+        )
+        out = out.transpose(1, 2).reshape(bsz, seq_len, -1)
+
+        attn_output, _ = self.o_proj(out)
+
+        return attn_output
+
+
+class Phi4MultimodalAudioConformerEncoderLayer(nn.Module):
+
+    def __init__(self, config: Phi4MultimodalAudioConfig):
+        super().__init__()
+
+        self.feed_forward_in = Phi4MultimodalAudioMLP(config)
+        self.self_attn = Phi4MultimodalAudioAttention(config)
+        self.conv = Phi4MultimodalAudioConvModule(config)
+        self.feed_forward_out = Phi4MultimodalAudioMLP(config)
+        self.layer_norm_att = nn.LayerNorm(config.hidden_size)
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        residual = hidden_states + 0.5 * self.feed_forward_in(hidden_states)
+        hidden_states = self.layer_norm_att(residual)
+
+        hidden_states = residual + self.self_attn(hidden_states,
+                                                  attention_mask)
+        hidden_states = hidden_states + self.conv(hidden_states)
+        hidden_states = hidden_states + 0.5 * self.feed_forward_out(
+            hidden_states)
+
+        out = self.layer_norm(hidden_states)
+
+        return out
+
+
+class Phi4MMAudioMeanVarianceNormLayer(nn.Module):
+    """Mean/variance normalization layer.
+
+    Will subtract mean and multiply input by inverted standard deviation.
+    Typically used as a very first layer in a model.
+
+    Args:
+        input_size: int
+            layer input size.
+    """
+
+    def __init__(self, config: Phi4MultimodalAudioConfig):
+        super().__init__()
+        self.global_mean = nn.Parameter(torch.zeros(config.input_size))
+        self.global_invstd = nn.Parameter(torch.ones(config.input_size))
+
+    def forward(self, input_: torch.Tensor) -> torch.Tensor:
+        """MeanVarianceNormLayer Forward
+
+        Args:
+            input_: torch.Tensor
+                input tensor.
+        """
+        return (input_ - self.global_mean) * self.global_invstd
+
+
+class Phi4MultimodalAudioModel(nn.Module):
+
+    def __init__(self, config: Phi4MultimodalAudioConfig):
+        super().__init__()
+        self.config = config
+
+        self.encoder_embedding = Phi4MMAudioMeanVarianceNormLayer(config)
+        self.embed = Phi4MultimodalAudioNemoConvSubsampling(config)
+        self.relative_attention_bias_layer = (
+            Phi4MultimodalAudioRelativeAttentionBias(config))
+        self.encoders = nn.ModuleList([
+            Phi4MultimodalAudioConformerEncoderLayer(config)
+            for _ in range(config.num_blocks)
+        ])
+
+    def _streaming_mask(
+        self,
+        seq_len: int,
+        batch_size: int,
+        chunk_size: int,
+        left_chunk: int,
+    ):
+        # Create mask matrix for streaming
+        # S stores start index. if chunksize is 18, s is [0,18,36,....]
+        chunk_start_idx = np.arange(0, seq_len, chunk_size)
+
+        enc_streaming_mask = (adaptive_enc_mask(
+            seq_len, chunk_start_idx,
+            left_window=left_chunk).unsqueeze(0).expand([batch_size, -1, -1]))
+        return enc_streaming_mask
+
+    def forward_embeddings(
+        self,
+        hidden_states: torch.Tensor,
+        masks: torch.Tensor,
+    ):
+        """Forwarding the inputs through the top embedding layers"""
+        seq_len = math.ceil(hidden_states.shape[1] /
+                            self.config.time_reduction)
+        if seq_len <= 0:
+            raise ValueError(
+                f"Sequence length after time reduction is invalid: {seq_len}."
+                "Your input feature is too short.")
+
+        batch_size = hidden_states.shape[0]
+
+        enc_streaming_mask = self._streaming_mask(seq_len, batch_size,
+                                                  self.config.chunk_size,
+                                                  self.config.left_chunk)
+        enc_streaming_mask = enc_streaming_mask.to(hidden_states.device)
+
+        hidden_states, masks = self.embed(hidden_states, masks)
+
+        streaming_mask = enc_streaming_mask
+        if streaming_mask is not None and masks is not None:
+            hs_mask = masks & streaming_mask
+        elif masks is not None:
+            hs_mask = masks
+        else:
+            hs_mask = streaming_mask
+
+        return hidden_states, hs_mask, masks
+
+    def calculate_hs_mask(self, hidden_states: torch.Tensor,
+                          device: torch.device, mask: torch.Tensor):
+        max_audio_length = hidden_states.shape[1]
+        batch_size = hidden_states.shape[0]
+        enc_streaming_mask = self._streaming_mask(max_audio_length, batch_size,
+                                                  self.config.chunk_size,
+                                                  self.config.left_chunk)
+        enc_streaming_mask = enc_streaming_mask.to(device)
+        if mask is None:
+            return enc_streaming_mask
+
+        feature_lens = mask.sum(1)
+        padding_length = feature_lens
+        pad_mask = torch.arange(0, max_audio_length, device=device).expand(
+            padding_length.size(0), -1) < padding_length.unsqueeze(1)
+        pad_mask = pad_mask.unsqueeze(1)
+        pad_mask = pad_mask & enc_streaming_mask
+        return pad_mask
+
+    def forward(self,
+                hidden_states: torch.Tensor,
+                mask: Optional[torch.Tensor] = None):
+        hidden_states = self.encoder_embedding(hidden_states)
+        hidden_states, hs_mask, mask = self.forward_embeddings(
+            hidden_states, mask)
+
+        unfolded = False
+        bs, seq_len, _ = hidden_states.shape
+        max_seq_len = 500  # maximum position for absolute positional encoding
+        if seq_len > max_seq_len:
+            # audio sequence is longer than max_seq_len,
+            # unfold it into chunks of max_seq_len
+            unfolded = True
+            # the unfold op will drop residual frames,
+            # pad it to the multiple of max_seq_len
+            if seq_len % max_seq_len > 0:
+                chunk_pad_size = max_seq_len - (seq_len % max_seq_len)
+            else:
+                chunk_pad_size = 0
+            if chunk_pad_size > 0:
+                hidden_states_pad = F.pad(hidden_states,
+                                          (0, 0, 0, chunk_pad_size),
+                                          "constant", 0)
+                hidden_states = hidden_states_pad.to(hidden_states.device)
+
+            hidden_states = unfold_tensor(hidden_states, max_seq_len)
+            masks_unfold = None
+            if mask is not None:
+                # revise hs_mask here because the previous calculated hs_mask
+                # did not consider extra pad
+                subsampled_pad_mask = mask.squeeze(
+                    1)  # [bz, subsampled_unmask_seq_len]
+                extra_padded_subsamlped_pad_mask = F.pad(
+                    subsampled_pad_mask, (0, chunk_pad_size), "constant",
+                    False)  # extra padding to the pad mask
+                extra_padded_subsamlped_pad_mask = (
+                    extra_padded_subsamlped_pad_mask.unsqueeze(-1).float())
+                masks_unfold = unfold_tensor(
+                    extra_padded_subsamlped_pad_mask, max_seq_len
+                )  # unfold the pad mask like we did to the input tensor
+                masks_unfold = masks_unfold.squeeze(
+                    -1).bool()  # unfold op does not support bool tensor
+            hs_mask = self.calculate_hs_mask(
+                hidden_states, hidden_states.device, masks_unfold
+            )  # calculate hs_mask based on the unfolded pad mask
+
+        relative_attention_bias = self.relative_attention_bias_layer(
+            hidden_states)
+        attention_mask = hs_mask.unsqueeze(1) + relative_attention_bias
+
+        for layer in self.encoders:
+            hidden_states = layer(hidden_states, attention_mask)
+
+        if unfolded:
+            embed_dim = hidden_states.shape[-1]
+            hidden_states = hidden_states.reshape(bs, -1, embed_dim)
+            # if we ever padded before unfolding, we need to remove the padding
+            if chunk_pad_size > 0:
+                hidden_states = hidden_states[:, :-chunk_pad_size, :]
+
+        return hidden_states
+
+
+class Phi4MMAudioEmbedding(nn.Module):
+
+    def __init__(self, config: Phi4MultimodalConfig):
+        super().__init__()
+        self.config = config
+        self.layer_idx = config.audio_config.feature_layer
+
+        self.encoder = Phi4MultimodalAudioModel(config.audio_config)
+
+        audio_config = config.audio_config
+        proj_input_size = (audio_config.hidden_size *
+                           audio_config.downsample_rate)
+        self.vision_speech_projection = Phi4MMProjector(
+            proj_input_size, config.hidden_size)
+        self.speech_projection = Phi4MMProjector(proj_input_size,
+                                                 config.hidden_size)
+
+    def get_projection(
+        self,
+        audio_projection_mode: Literal["speech", "vision"],
+    ) -> Phi4MMProjector:
+        if audio_projection_mode == "speech":
+            return self.speech_projection
+        elif audio_projection_mode == "vision":
+            return self.vision_speech_projection
+
+    def forward(
+        self,
+        audio_input_features: torch.FloatTensor,
+        audio_embed_sizes=None,
+        audio_attention_mask=None,
+        audio_projection_mode="speech",
+    ) -> torch.FloatTensor:
+
+        audio_projection = self.get_projection(audio_projection_mode)
+
+        target_device = audio_projection.up.bias.device
+        target_dtype = audio_projection.up.bias.dtype
+
+        audio_input_features = audio_input_features.to(device=target_device,
+                                                       dtype=target_dtype)
+
+        audio_encoder_hidden_states = self.encoder(audio_input_features,
+                                                   audio_attention_mask)
+        audio_embeds = audio_projection(audio_encoder_hidden_states)
+
+        return audio_embeds.flatten(0, 1)
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class Phi4MMImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    data: Union[torch.Tensor, list[torch.Tensor]]
+    """
+    Shape:
+    `(batch_size * num_images, 1 + num_patches, num_channels, height, width)`
+
+    Note that `num_patches` may be different per batch and image,
+    in which case the data is passed as a list instead of a batched tensor.
+    """
+
+    image_sizes: torch.Tensor
+    """
+    Shape: `(batch_size * num_images, 2)`
+
+    This should be in `(height, width)` format.
+    """
+
+    num_img_tokens: list[int]
+    """Shape: `(batch_size * num_images)`"""
+
+    image_attention_mask: torch.Tensor
+    """Shape: `(batch_size * num_images, H_mask, W_mask)`"""
+
+
+class Phi4MMImageEmbeddingInputs(TypedDict):
+    type: Literal["image_embeds"]
+    data: Union[torch.Tensor, list[torch.Tensor]]
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
+
+    `hidden_size` must match the hidden size of language model backbone.
+    """
+
+
+class Phi4MMAudioFeatureInputs(TypedDict):
+    type: Literal["audio_features"]
+    data: Union[torch.Tensor, list[torch.Tensor]]
+    """Shape: `(batch_size * num_audios, 80, M)"""
+
+
+class Phi4MMAudioEmbeddingInputs(TypedDict):
+    type: Literal["audio_embeds"]
+    data: NestedTensors
+    """Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)"""
+
+
+Phi4MMImageInput = Union[Phi4MMImagePixelInputs, Phi4MMImageEmbeddingInputs]
+Phi4MMAudioInputs = Union[Phi4MMAudioFeatureInputs, Phi4MMAudioEmbeddingInputs]
+
+
+def cat_with_pad(tensors, dim, padding_value=0):
+    """
+    cat along dim, while pad to max for all other dims
+    """
+    ndim = tensors[0].dim()
+    assert all(
+        t.dim() == ndim for t in
+        tensors[1:]), "All tensors must have the same number of dimensions"
+
+    out_size = [max(t.shape[i] for t in tensors) for i in range(ndim)]
+    out_size[dim] = sum(t.shape[dim] for t in tensors)
+    output = tensors[0].new_full(out_size, padding_value)
+
+    index = 0
+    for t in tensors:
+        # Create a slice list where every dimension except dim is full slice
+        slices = [slice(0, t.shape[d]) for d in range(ndim)]
+        # Update only the concat dimension slice
+        slices[dim] = slice(index, index + t.shape[dim])
+
+        output[slices] = t
+        index += t.shape[dim]
+
+    return output
+
+
+class Phi4MMProcessingInfo(BaseProcessingInfo):
+
+    def get_hf_config(self) -> Phi4MultimodalConfig:
+        return self.ctx.get_hf_config(Phi4MultimodalConfig)
+
+    def get_hf_processor(
+        self,
+        *,
+        dynamic_hd: Optional[int] = None,
+        **kwargs: object,
+    ) -> Phi4MMProcessor:
+        if dynamic_hd is not None:
+            kwargs["dynamic_hd"] = dynamic_hd
+
+        return self.ctx.get_hf_processor(**kwargs)
+
+    def get_feature_extractor(self) -> Phi4MultimodalFeatureExtractor:
+        return self.get_hf_processor().audio_processor
+
+    def get_image_processor(
+        self,
+        processor: Optional[Phi4MMProcessor] = None,
+    ) -> Phi4MultimodalImageProcessorFast:
+        if processor is None:
+            processor = self.get_hf_processor()
+        return processor.image_processor
+
+    def get_dynamic_hd(
+        self,
+        processor: Optional[Phi4MMProcessor] = None,
+    ) -> int:
+        return self.get_image_processor(processor).dynamic_hd
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"audio": None, "image": None}
+
+    def _find_target_aspect_ratio(
+        self,
+        orig_width: int,
+        orig_height: int,
+        image_size: int,
+        max_num: int,
+        min_num: int,
+    ):
+        w_crop_num = math.ceil(orig_width / float(image_size))
+        h_crop_num = math.ceil(orig_height / float(image_size))
+        if w_crop_num * h_crop_num > max_num:
+            aspect_ratio = orig_width / orig_height
+
+            # calculate the existing image aspect ratio
+            target_ratios = set((i, j) for i in range(1, max_num + 1)
+                                for j in range(1, max_num + 1)
+                                if i * j <= max_num and i * j >= min_num)
+            target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+            # find the closest aspect ratio to the target
+            image_processor = self.get_image_processor()
+            target_aspect_ratio = image_processor.find_closest_aspect_ratio(
+                aspect_ratio,
+                target_ratios,
+                orig_width,
+                orig_height,
+            )
+
+            # calculate the target width and height
+            target_width = image_size * target_aspect_ratio[0]
+            target_height = image_size * target_aspect_ratio[1]
+        else:
+            target_width = image_size * w_crop_num
+            target_height = image_size * h_crop_num
+            target_aspect_ratio = (w_crop_num, h_crop_num)
+        return target_aspect_ratio, target_height, target_width
+
+    def _compute_num_image_tokens(
+        self,
+        orig_width: int,
+        orig_height: int,
+        dynamic_hd_size: int,
+        vit_image_size: int,
+        vit_patch_size: int,
+        token_compression_factor: int = 2,
+    ):
+        """
+        compute the number of tokens an image is expected to take up considering
+        the image encoder architecture and exclude output features containing 
+        only padding pixels
+
+        for siglip, vit_image_size=448, vit_patch_size=14, so output will be 
+        32x32 feature map
+        NOTE right now, Phi4MM uses hard-coded token_compression_factor=2
+        """
+        assert vit_image_size % vit_patch_size == 0, (
+            "vit_image_size must be divisible by vit_patch_size")
+        assert (vit_image_size // vit_patch_size %
+                token_compression_factor == 0), (
+                    "vit_image_size // vit_patch_size must be divisible by "
+                    "token_compression_factor")
+
+        target_aspect_ratio, target_height, target_width = (
+            self._find_target_aspect_ratio(orig_width,
+                                           orig_height,
+                                           vit_image_size,
+                                           dynamic_hd_size,
+                                           min_num=1))
+        assert target_aspect_ratio[0] * vit_image_size == target_width, (
+            f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}")
+        assert target_aspect_ratio[1] * vit_image_size == target_height, (
+            f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}")
+        assert (target_height % vit_image_size == 0
+                and target_width % vit_image_size == 0)
+
+        padding_height, padding_width = _get_padding_size(
+            orig_width, orig_height, target_height, target_width)
+        assert padding_width == 0 or padding_height == 0, \
+            "padding_width or padding_height must be 0"
+
+        target_feat_width = target_width // vit_patch_size
+        target_feat_height = target_height // vit_patch_size
+        if padding_width >= vit_patch_size:
+            assert padding_height == 0, "padding_height not 0"
+            non_pad_feat_width = target_feat_width - math.floor(
+                padding_width / vit_patch_size)
+            non_pad_feat_height = target_feat_height
+        elif padding_height >= vit_patch_size:
+            assert padding_width == 0, "padding_width not 0"
+            non_pad_feat_height = target_feat_height - math.floor(
+                padding_height / vit_patch_size)
+            non_pad_feat_width = target_feat_width
+        else:
+            # small padding shorter than a vit patch
+            non_pad_feat_width = target_feat_width
+            non_pad_feat_height = target_feat_height
+
+        feat_width = non_pad_feat_width // token_compression_factor
+        feat_height = non_pad_feat_height // token_compression_factor
+        # NOTE it's possible that the non-padding feature is not divisible
+        if non_pad_feat_width % token_compression_factor != 0:
+            feat_width += 1
+        if non_pad_feat_height % token_compression_factor != 0:
+            feat_height += 1
+        num_hd_patch_tokens = feat_width * feat_height
+        num_hd_newline_tokens = feat_height
+        vit_feature_size = vit_image_size // vit_patch_size
+        num_global_image_tokens = (vit_feature_size //
+                                   token_compression_factor)**2
+        num_sep_tokens = 1
+        num_global_image_newline_tokens = \
+            vit_feature_size // token_compression_factor
+
+        return (num_global_image_tokens + num_sep_tokens +
+                num_hd_patch_tokens + num_hd_newline_tokens +
+                num_global_image_newline_tokens)
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        processor: Optional[Phi4MMProcessor] = None,
+    ) -> int:
+        hf_config = self.get_hf_config()
+        vision_config = hf_config.vision_config
+        vit_image_size = vision_config.image_size
+        vit_patch_size = vision_config.patch_size
+
+        dynamic_hd_size = self.get_dynamic_hd(processor=processor)
+
+        # we use default `token_compression_factor=2`,
+        # since it's not in HF vision config.
+        image_num_tokens = self._compute_num_image_tokens(
+            image_width,
+            image_height,
+            dynamic_hd_size=dynamic_hd_size,
+            vit_image_size=vit_image_size,
+            vit_patch_size=vit_patch_size,
+        )
+
+        return image_num_tokens
+
+    def get_image_size_with_most_features(
+        self,
+        processor: Optional[Phi4MMProcessor] = None,
+    ) -> ImageSize:
+        vit_image_size = self.get_hf_config().vision_config.image_size
+
+        max_side = vit_image_size * self.get_dynamic_hd(processor=processor)
+        return ImageSize(height=max_side, width=vit_image_size)
+
+    def get_audio_num_frames(self, audio_len: int, sr: float) -> int:
+        """
+        Compute the output size of the `extract_features` method.
+
+        Args:
+            audio_len (int): Length of the input waveform in samples.
+            sr (float): Sampling rate of the waveform, either 16000 or 8000.
+
+        Returns:
+            tuple (int, int): Output size as (T, D), where:
+                T: Number of time frames.
+                D: Number of Mel filterbank bins (80).
+        """
+
+        # Resample to 16000 or 8000 if needed
+        if sr > 16000:
+            audio_len //= sr // 16000
+        elif 8000 <= sr < 16000:
+            # We'll resample to 16K from 8K
+            audio_len *= 2
+        elif sr < 8000:
+            raise RuntimeError(f"Unsupported sample rate {sr}")
+
+        # Spectrogram parameters for 16 kHz
+        win_length = 400  # Frame length in samples
+        hop_length = 160  # Frame shift in samples
+
+        # Calculate number of frames (T)
+        num_frames = (audio_len - win_length) // hop_length + 1
+        if num_frames < 1:
+            raise ValueError("Waveform too short for given parameters.")
+
+        # Return time frames (T)
+        return num_frames
+
+    def _compute_audio_embed_size(self, audio_frames: int) -> int:
+        """
+        Compute the size of audio embeddings from the number of audio frames.
+        """
+        # `_compute_audio_embed_size` in audio_processor use torch for
+        # computation, therefore we re-implement it to use pythonic
+        # numeric computation to avoid extra tensor conversion.
+        audio_processor = self.get_feature_extractor()
+        audio_compression_rate = audio_processor.audio_compression_rate
+        audio_downsample_rate = audio_processor.audio_downsample_rate
+
+        integer = audio_frames // audio_compression_rate
+        remainder = audio_frames % audio_compression_rate
+        result = integer + int(remainder > 0)
+
+        integer = result // audio_downsample_rate
+        remainder = result % audio_downsample_rate
+        result = integer + int(remainder > 0)  # qformer compression
+
+        return result
+
+
+class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
+
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_audios = mm_counts.get("audio", 0)
+        num_images = mm_counts.get("image", 0)
+
+        tokenizer = self.info.get_tokenizer()
+        image_tokens: str = tokenizer.image_token * num_images
+        audio_tokens: str = tokenizer.audio_token * num_audios
+
+        return image_tokens + audio_tokens
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalDataDict:
+        num_audios = mm_counts.get("audio", 0)
+        num_images = mm_counts.get("image", 0)
+
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
+                                   num_images=num_images),
+            "audio":
+            self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE,
+                                   num_audios=num_audios),
+        }
+
+        return mm_data
+
+
+class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
+
+    def _get_data_parser(self) -> MultiModalDataParser:
+        feature_extractor = self.info.get_feature_extractor()
+        return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        if not mm_data:
+            prompt_ids = self.info.get_tokenizer().encode(prompt)
+            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
+            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+
+        audio_data = mm_data.pop("audios", [])
+        if audio_data:
+            mm_data['audio'] = audio_data
+
+        processed_outputs = super()._call_hf_processor(prompt, mm_data,
+                                                       mm_kwargs, tok_kwargs)
+
+        if "image_pixel_values" in processed_outputs:
+            num_img_tokens = [
+                self.info.get_num_image_tokens(image_width=img_size[0],
+                                               image_height=img_size[1])
+                for img_size in processed_outputs["image_sizes"]
+            ]
+            processed_outputs["num_img_tokens"] = num_img_tokens
+
+        if audio_data:
+            audio_features = processed_outputs['audio_input_features']
+            sr = self.info.get_feature_extractor().sampling_rate
+            feature_sizes = [
+                self.info.get_audio_num_frames(len(audio), sr)
+                for audio in audio_data
+            ]
+            processed_outputs['audio_input_features'] = [
+                audio_features[idx, :size]
+                for idx, size in enumerate(feature_sizes)
+            ]
+
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            image_pixel_values=MultiModalFieldConfig.batched("image"),
+            image_attention_mask=MultiModalFieldConfig.batched("image"),
+            image_sizes=MultiModalFieldConfig.batched("image"),
+            num_img_tokens=MultiModalFieldConfig.batched("image"),
+            audio_input_features=MultiModalFieldConfig.batched("audio"),
+        )
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, Any],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> Sequence[PromptUpdate]:
+        tokenizer = self.info.get_tokenizer()
+        image_token_id = tokenizer.vocab[tokenizer.image_token]
+        audio_token_id = tokenizer.vocab[tokenizer.audio_token]
+
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        audio_processor = self.info.get_feature_extractor()
+
+        def get_image_replacement_phi4mm(item_idx: int):
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems))
+
+            if isinstance(images, ImageEmbeddingItems):
+                num_image_tokens = images.get_feature_size(item_idx)
+            else:
+                image_size = images.get_image_size(item_idx)
+                num_image_tokens = self.info.get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                    processor=hf_processor,
+                )
+
+            image_tokens = [image_token_id] * num_image_tokens
+
+            return image_tokens
+
+        def get_audio_replacement_phi4mm(item_idx: int):
+            audios = mm_items.get_items("audio", AudioProcessorItems)
+            # TODO(Isotr0py): support embedding inputs
+            audio_len = audios.get_audio_length(item_idx)
+            audio_frames = self.info.get_audio_num_frames(
+                audio_len, audio_processor.sampling_rate)
+            audio_embed_size = self.info._compute_audio_embed_size(
+                audio_frames)
+
+            audio_tokens = [audio_token_id] * audio_embed_size
+
+            return audio_tokens
+
+        return [
+            PromptReplacement(
+                modality="audio",
+                target=[audio_token_id],
+                replacement=get_audio_replacement_phi4mm,
+            ),
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id],
+                replacement=get_image_replacement_phi4mm,
+            ),
+        ]
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    Phi4MMMultiModalProcessor,
+    info=Phi4MMProcessingInfo,
+    dummy_inputs=Phi4MMDummyInputsBuilder,
+)
+class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
+    """
+    Implements the Phi-4-multimodal-instruct model in vLLM.
+    """
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "qkv_proj",
+        ],
+        "gate_up_proj": [
+            "gate_up_proj",
+        ],
+    }
+
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            # Multimodal embedding
+            "model.embed_tokens_extend.": "",
+            # LLM backbone
+            "model.": "language_model.model.",
+        },
+        orig_to_new_substr={
+            # projection
+            ".img_projection_": ".img_projection.",
+            ".up_proj_for_speech.": ".speech_projection.up.",
+            ".up_proj_for_vision_speech.": ".vision_speech_projection.up.",
+            ".down_proj_for_speech.": ".speech_projection.down.",
+            ".down_proj_for_vision_speech.": ".vision_speech_projection.down.",
+        },
+    )
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+        if modality.startswith("image"):
+            return "<|image|>"
+        if modality.startswith("audio"):
+            return "<|audio|>"
+
+        raise ValueError("Only image or audio modality is supported")
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        # TODO: Optionally initializes these for supporting input embeddings.
+        self.image_embed = Phi4MMImageEmbedding(
+            config,
+            # prefix=maybe_prefix(prefix, "image_embed"),
+        )
+        self.audio_embed = Phi4MMAudioEmbedding(
+            config,
+            # prefix=maybe_prefix(prefix, "audio_embed"),
+        )
+
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "language_model"),
+            architectures=["Phi3ForCausalLM"],
+        )
+
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors)
+
+    def _parse_and_validate_audio_input(
+            self, **kwargs: object) -> Optional[Phi4MMAudioInputs]:
+        """
+        Parse and validate the audio input to the model.  This handles both 
+        audio features and audio embeddings, but only the former is used for
+        now.
+
+        Args:
+            kwargs (object): Keyword arguments.
+
+        Returns:
+            Optional[Phi4MMAudioInputs]: Parsed and validated audio inputs.
+        """
+        audio_features = kwargs.pop("audio_input_features", None)
+        audio_embeds = kwargs.pop("audio_embeds", None)
+
+        if audio_features is None and audio_embeds is None:
+            return None
+
+        if audio_features is not None:
+            if not isinstance(audio_features, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of audio features. "
+                                 f"Got type: {type(audio_features)}")
+
+            return Phi4MMAudioFeatureInputs(type="audio_features",
+                                            data=flatten_bn(audio_features))
+
+        if audio_embeds is not None:
+            if not isinstance(audio_embeds, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of audio embeds. "
+                                 f"Got type: {type(audio_embeds)}")
+
+            return Phi4MMAudioEmbeddingInputs(type="audio_embeds",
+                                              data=audio_embeds)
+
+        raise AssertionError("This line should be unreachable.")
+
+    def _process_audio_input(self, audio_input: Phi4MMAudioInputs,
+                             audio_projection_mode: str) -> NestedTensors:
+        """
+        Create the audio embeddings from the audio input, where the audio input
+        is pairs of audio features and audio embed lengths.  The audio input is
+        created by `input_mapper_for_phi4mm_audio`.
+
+        Args:
+            audio_input (Phi4MMAudioInputs): Audio input.
+
+        Returns:
+            NestedTensors: Audio embeddings
+        """
+        if audio_input["type"] == "audio_embeds":
+            return audio_input["data"]
+
+        audio_features = audio_input["data"]
+        # (e.g. multiple examples) and the second dim is the multi-audio dim
+        # (e.g. multiple audios in the same example)
+
+        dtype = next(self.audio_embed.parameters()).dtype
+        audio_embeds = [
+            self.audio_embed(
+                features.unsqueeze(0).to(dtype),
+                audio_projection_mode=audio_projection_mode,
+            ) for features in audio_features
+        ]
+        return audio_embeds
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[Phi4MMImagePixelInputs]:
+        image_pixel_values: NestedTensors = kwargs.get("image_pixel_values")
+        if image_pixel_values is None:
+            return None
+
+        image_sizes = kwargs.get("image_sizes")
+        image_attention_mask = kwargs.get("image_attention_mask")
+        num_img_tokens = kwargs.get("num_img_tokens")
+        assert image_sizes is not None and image_attention_mask is not None\
+              and num_img_tokens is not None, "Missing image inputs"
+
+        if is_list_of(image_pixel_values, torch.Tensor):
+            assert all(p.dim() == 5
+                       for p in image_pixel_values), "Incorrect image inputs"
+            # list len is batch_size.
+            # each tensor has dimension: num_img_per_example, num_hd_patches,
+            # channels, height, width.
+            # need to pad along num_hd_patches.
+            # mask size num_img_per_prompt, num_hd_patches, feat_h, heat_w.
+            image_pixel_values = cat_with_pad(image_pixel_values, dim=0)
+        elif isinstance(image_pixel_values, torch.Tensor):
+            # dimension: batch_size, num_img_per_example, num_hd_patches,
+            # channels, height, width.
+            # we flatten first 2 dims to make it a single large batch for
+            # SigLIP Encoder.
+            assert image_pixel_values.dim() == 6, "Incorrect image inputs"
+            image_pixel_values = image_pixel_values.flatten(0, 1)
+        else:
+            raise ValueError("Incorrect image_pixel_values inputs")
+
+        if isinstance(image_attention_mask, list):
+            image_attention_mask = cat_with_pad(image_attention_mask, dim=0)
+        elif isinstance(image_attention_mask, torch.Tensor):
+            image_attention_mask = image_attention_mask.flatten(0, 1)
+        else:
+            raise ValueError("Incorrect image_attention_mask inputs")
+
+        if isinstance(image_sizes, list):
+            image_sizes = torch.cat(image_sizes, dim=0)
+        elif isinstance(image_sizes, torch.Tensor):
+            image_sizes = image_sizes.flatten(0, 1)
+        else:
+            raise ValueError("Incorrect image_attention_mask inputs")
+
+        if isinstance(num_img_tokens, list):
+            num_img_tokens = [
+                n for num_tensor in num_img_tokens
+                for n in num_tensor.tolist()
+            ]
+        elif isinstance(num_img_tokens, torch.Tensor):
+            num_img_tokens = num_img_tokens.flatten(0, 1).tolist()
+        else:
+            raise ValueError("Incorrect image_attention_mask inputs")
+
+        return Phi4MMImagePixelInputs(
+            type="pixel_values",
+            data=image_pixel_values,
+            image_sizes=image_sizes,
+            image_attention_mask=image_attention_mask,
+            num_img_tokens=num_img_tokens,
+        )
+
+    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
+        modalities = {}
+
+        # Preserve the order of modalities if there are multiple of them
+        # from the order of kwargs.
+        for input_key in kwargs:
+            if input_key in ("image_pixel_values",
+                             "image_embeds") and "images" not in modalities:
+                modalities["images"] = self._parse_and_validate_image_input(
+                    **kwargs)
+            if input_key in ("audio_input_features",
+                             "audio_embeds") and "audios" not in modalities:
+                modalities["audios"] = self._parse_and_validate_audio_input(
+                    **kwargs)
+
+        return modalities
+
+    def _process_image_input(
+            self, image_input: Phi4MMImagePixelInputs) -> list[torch.Tensor]:
+        if image_input["type"] == "image_embeds":
+            image_embeds = image_input["image_embeds"].type(self.visual.dtype)
+        else:
+            dtype = next(self.image_embed.parameters()).dtype
+            pixel_values = image_input['data'].to(dtype)
+            image_sizes = image_input['image_sizes']
+            image_attention_mask = image_input['image_attention_mask']
+            image_embeds = self.image_embed(pixel_values, image_sizes,
+                                            image_attention_mask)
+        return image_embeds
+
+    def get_multimodal_embeddings(
+            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
+
+        modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
+        if not modalities:
+            return None
+
+        # The result multimodal_embeddings is tuple of tensors, with each
+        # tensor correspoending to a multimodal data item (image or video).
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+
+        # NOTE: It is important to iterate over the keys in this dictionary
+        # to preserve the order of the modalities.
+        audio_projection_mode = 'speech'
+        for modality in modalities:
+            # make sure process images first
+            if modality == "images":
+                audio_projection_mode = "vision"
+                image_input = modalities["images"]
+                vision_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(vision_embeddings)
+            if modality == "audios":
+                audio_input = modalities["audios"]
+                audio_embeddings = self._process_audio_input(
+                    audio_input, audio_projection_mode=audio_projection_mode)
+                multimodal_embeddings += tuple(audio_embeddings)
+
+        return multimodal_embeddings
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+        if multimodal_embeddings is not None:
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, multimodal_embeddings,
+                [_IMAGE_PLACEHOLDER_TOKEN_ID, _AUDIO_PLACEHOLDER_TOKEN_ID])
+        return inputs_embeds
+
+    def get_input_embeddings_v0(
+        self,
+        input_ids: torch.Tensor,
+        image_input: Optional[Phi4MMImagePixelInputs] = None,
+        audio_input: Optional[Phi4MMAudioFeatureInputs] = None,
+    ) -> torch.Tensor:
+        audio_projection_mode = 'speech'
+        inputs_embeds = self.get_input_embeddings(input_ids)
+        if image_input is not None:
+            image_embeds = self._process_image_input(image_input)
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                image_embeds,
+                placeholder_token_id=_IMAGE_PLACEHOLDER_TOKEN_ID,
+            )
+            audio_projection_mode = 'vision'
+
+        if audio_input is not None:
+            audio_embeds = self._process_audio_input(
+                audio_input, audio_projection_mode=audio_projection_mode)
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                audio_embeds,
+                placeholder_token_id=_AUDIO_PLACEHOLDER_TOKEN_ID,
+            )
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        # NOTE: In v1, inputs_embeds is always generated at model runner from
+        # `get_multimodal_embeddings` and `get_input_embeddings`, this
+        # condition is only for v0 compatibility.
+        elif inputs_embeds is None:
+            image_input = self._parse_and_validate_image_input(**kwargs)
+            audio_input = self._parse_and_validate_audio_input(**kwargs)
+
+            if image_input is None and audio_input is None:
+                inputs_embeds = None
+            else:
+                inputs_embeds = self.get_input_embeddings_v0(
+                    input_ids,
+                    image_input=image_input,
+                    audio_input=audio_input)
+                input_ids = None
+
+        hidden_states = self.language_model(
+            input_ids,
+            positions,
+            intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
+
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        return self.language_model.compute_logits(hidden_states,
+                                                  sampling_metadata)
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model.",
+            connector=[
+                "img_projection", "vision_speech_projection",
+                "speech_projection"
+            ],
+            tower_model=["image_embed", "audio_embed"],
+        )
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 709b2eb37a3ed..91d3b1e1033ad 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -223,6 +223,8 @@ _MULTIMODAL_MODELS = {
     "Ovis": ("ovis", "Ovis"),
     "PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"),  # noqa: E501
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
+    "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
+    "Phi4MultimodalForCausalLM": ("phi4_multimodal", "Phi4MultimodalForCausalLM"),  # noqa: E501
     "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"),  # noqa: E501
     "QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"),  # noqa: E501
     "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
@@ -231,7 +233,6 @@ _MULTIMODAL_MODELS = {
     "Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
     "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
     "UltravoxModel": ("ultravox", "UltravoxModel"),
-    "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
     "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"),  # noqa: E501
     "Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"),  # noqa: E501
     "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"),  # noqa: E501
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 25dd71d877fb1..24ddd35abea60 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -295,7 +295,7 @@ def cached_tokenizer_from_config(
     return cached_get_tokenizer(
         model_config.tokenizer,
         tokenizer_mode=model_config.tokenizer_mode,
-        tokenizer_revision=model_config.tokenizer_revision,
+        revision=model_config.tokenizer_revision,
         trust_remote_code=model_config.trust_remote_code,
         **kwargs,
     )

From 971948b846994819c50cb5c5f04a54822635e303 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Sat, 26 Jul 2025 20:35:22 -0700
Subject: [PATCH 48/57] Handle non-serializable objects in vllm bench (#21665)

---
 vllm/benchmarks/utils.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/benchmarks/utils.py b/vllm/benchmarks/utils.py
index f0bb99326ab40..5f95fdcc75829 100644
--- a/vllm/benchmarks/utils.py
+++ b/vllm/benchmarks/utils.py
@@ -67,4 +67,9 @@ class InfEncoder(json.JSONEncoder):
 
 def write_to_json(filename: str, records: list) -> None:
     with open(filename, "w") as f:
-        json.dump(records, f, cls=InfEncoder)
+        json.dump(
+            records,
+            f,
+            cls=InfEncoder,
+            default=lambda o: f"<{type(o).__name__} is not JSON serializable>",
+        )

From 01a395e9e720ec06db299ddb561da8e00b094182 Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi" <yeq@meta.com>
Date: Sat, 26 Jul 2025 21:02:12 -0700
Subject: [PATCH 49/57] [CI/Build][Doc] Clean up more docs that point to old
 bench scripts (#21667)

Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
---
 .buildkite/nightly-benchmarks/README.md       | 10 ++--
 .../convert-results-json-to-markdown.py       |  6 +-
 .../scripts/run-nightly-benchmarks.sh         |  2 +-
 .../scripts/run-performance-benchmarks.sh     |  8 +--
 benchmarks/auto_tune/README.md                |  2 +-
 .../disagg_overhead_benchmark.sh              | 60 +++++++++----------
 .../disagg_performance_benchmark.sh           | 30 +++++-----
 docs/contributing/profiling.md                |  9 ++-
 .../prometheus_grafana/README.md              |  2 +-
 9 files changed, 66 insertions(+), 63 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index cdf6a645147e5..ae42f70077cec 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -74,7 +74,7 @@ Here is an example of one test inside `latency-tests.json`:
 In this example:
 
 - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
-- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+- The `parameters` attribute control the command line arguments to be used for `vllm bench latency`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
 
 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
 
@@ -82,13 +82,13 @@ WARNING: The benchmarking script will save json results by itself, so please do
 
 ### Throughput test
 
-The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
+The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `vllm bench throughput`.
 
 The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
 
 ### Serving test
 
-We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
+We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
 
 ```json
 [
@@ -118,8 +118,8 @@ Inside this example:
 
 - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
 - The `server-parameters` includes the command line arguments for vLLM server.
-- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
-- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
+- The `client-parameters` includes the command line arguments for `vllm bench serve`.
+- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`
 
 The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
 
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 724b53056ca8f..05623879c0c2c 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -100,7 +100,7 @@ if __name__ == "__main__":
             raw_result = json.loads(f.read())
 
         if "serving" in str(test_file):
-            # this result is generated via `benchmark_serving.py`
+            # this result is generated via `vllm bench serve` command
 
             # attach the benchmarking command to raw_result
             try:
@@ -120,7 +120,7 @@ if __name__ == "__main__":
             continue
 
         elif "latency" in f.name:
-            # this result is generated via `benchmark_latency.py`
+            # this result is generated via `vllm bench latency` command
 
             # attach the benchmarking command to raw_result
             try:
@@ -148,7 +148,7 @@ if __name__ == "__main__":
             continue
 
         elif "throughput" in f.name:
-            # this result is generated via `benchmark_throughput.py`
+            # this result is generated via `vllm bench throughput` command
 
             # attach the benchmarking command to raw_result
             try:
diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
index 29b8ccbf0a2f7..06d7b5ed484da 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -127,7 +127,7 @@ ensure_installed() {
 }
 
 run_serving_tests() {
-  # run serving tests using `benchmark_serving.py`
+  # run serving tests using `vllm bench serve` command
   # $1: a json file specifying serving test cases
 
   local serving_test_file
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index 476a37b33a110..b515ee43934d1 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -165,7 +165,7 @@ upload_to_buildkite() {
 }
 
 run_latency_tests() {
-  # run latency tests using `benchmark_latency.py`
+  # run latency tests using `vllm bench latency` command
   # $1: a json file specifying latency test cases
 
   local latency_test_file
@@ -232,7 +232,7 @@ run_latency_tests() {
 }
 
 run_throughput_tests() {
-  # run throughput tests using `benchmark_throughput.py`
+  # run throughput tests using `vllm bench throughput`
   # $1: a json file specifying throughput test cases
 
   local throughput_test_file
@@ -298,7 +298,7 @@ run_throughput_tests() {
 }
 
 run_serving_tests() {
-  # run serving tests using `benchmark_serving.py`
+  # run serving tests using `vllm bench serve` command
   # $1: a json file specifying serving test cases
 
   local serving_test_file
@@ -448,7 +448,7 @@ main() {
   (which jq) || (apt-get update && apt-get -y install jq)
   (which lsof) || (apt-get update && apt-get install -y lsof)
 
-  # get the current IP address, required by benchmark_serving.py
+  # get the current IP address, required by `vllm bench serve` command
   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
   # turn of the reporting of the status of each request, to clean up the terminal output
   export VLLM_LOGGING_LEVEL="WARNING"
diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md
index ae5962fe92542..c479ff1aa29c0 100644
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -105,7 +105,7 @@ After the script finishes, you will find the results in a new, timestamped direc
 
 - **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
     - `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
-    - `bm_log_...txt`: The log output from the `benchmark_serving.py` script for each benchmark run.
+    - `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run.
 
 - **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
 
diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
index b150b01949658..92f97ffabea2a 100644
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -3,7 +3,7 @@
 # benchmark the overhead of disaggregated prefill.
 # methodology:
 # - send all request to prefill vLLM instance. It will buffer KV cache.
-# - then send all request to decode instance. 
+# - then send all request to decode instance.
 # - The TTFT of decode instance is the overhead.
 
 set -ex
@@ -63,7 +63,7 @@ benchmark() {
     --gpu-memory-utilization 0.6 \
     --kv-transfer-config \
     '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
-    
+
 
   CUDA_VISIBLE_DEVICES=1 python3 \
     -m vllm.entrypoints.openai.api_server \
@@ -78,38 +78,38 @@ benchmark() {
   wait_for_server 8200
 
   # let the prefill instance finish prefill
-  python3 ../benchmark_serving.py \
-          --backend vllm \
-          --model $model \
-          --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
-          --sonnet-input-len $input_len \
-          --sonnet-output-len "$output_len" \
-          --sonnet-prefix-len $prefix_len \
-          --num-prompts $num_prompts \
-          --port 8100 \
-          --save-result \
-          --result-dir $results_folder \
-          --result-filename disagg_prefill_tp1.json \
-          --request-rate "inf"
+  vllm bench serve \
+    --backend vllm \
+    --model $model \
+    --dataset-name $dataset_name \
+    --dataset-path $dataset_path \
+    --sonnet-input-len $input_len \
+    --sonnet-output-len "$output_len" \
+    --sonnet-prefix-len $prefix_len \
+    --num-prompts $num_prompts \
+    --port 8100 \
+    --save-result \
+    --result-dir $results_folder \
+    --result-filename disagg_prefill_tp1.json \
+    --request-rate "inf"
 
 
   # send the request to decode.
   # The TTFT of this command will be the overhead of disagg prefill impl.
-  python3 ../benchmark_serving.py \
-          --backend vllm \
-          --model $model \
-          --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
-          --sonnet-input-len $input_len \
-          --sonnet-output-len "$output_len" \
-          --sonnet-prefix-len $prefix_len \
-          --num-prompts $num_prompts \
-          --port 8200 \
-          --save-result \
-          --result-dir $results_folder \
-          --result-filename disagg_prefill_tp1_overhead.json \
-          --request-rate "$qps"
+  vllm bench serve \
+    --backend vllm \
+    --model $model \
+    --dataset-name $dataset_name \
+    --dataset-path $dataset_path \
+    --sonnet-input-len $input_len \
+    --sonnet-output-len "$output_len" \
+    --sonnet-prefix-len $prefix_len \
+    --num-prompts $num_prompts \
+    --port 8200 \
+    --save-result \
+    --result-dir $results_folder \
+    --result-filename disagg_prefill_tp1_overhead.json \
+    --request-rate "$qps"
   kill_gpu_processes
 
 }
diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
index c5a483f2ff22b..af2bcba3ea57a 100644
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -60,7 +60,7 @@ launch_chunked_prefill() {
 
 
 launch_disagg_prefill() {
-  model="meta-llama/Meta-Llama-3.1-8B-Instruct" 
+  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
   # disagg prefill
   CUDA_VISIBLE_DEVICES=0 python3 \
     -m vllm.entrypoints.openai.api_server \
@@ -99,20 +99,20 @@ benchmark() {
   output_len=$2
   tag=$3
 
-  python3 ../benchmark_serving.py \
-          --backend vllm \
-          --model $model \
-          --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
-          --sonnet-input-len $input_len \
-          --sonnet-output-len "$output_len" \
-          --sonnet-prefix-len $prefix_len \
-          --num-prompts $num_prompts \
-          --port 8000 \
-          --save-result \
-          --result-dir $results_folder \
-          --result-filename "$tag"-qps-"$qps".json \
-          --request-rate "$qps"
+  vllm bench serve \
+    --backend vllm \
+    --model $model \
+    --dataset-name $dataset_name \
+    --dataset-path $dataset_path \
+    --sonnet-input-len $input_len \
+    --sonnet-output-len "$output_len" \
+    --sonnet-prefix-len $prefix_len \
+    --num-prompts $num_prompts \
+    --port 8000 \
+    --save-result \
+    --result-dir $results_folder \
+    --result-filename "$tag"-qps-"$qps".json \
+    --request-rate "$qps"
 
   sleep 2
 }
diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
index aa3de617e072c..13c3bc2c7e031 100644
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -9,10 +9,13 @@ We support tracing vLLM workers using the `torch.profiler` module. You can enabl
 
 The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
 
-When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag.
+When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag.
 
 Traces can be visualized using <https://ui.perfetto.dev/>.
 
+!!! tip
+You can directly call bench module without installing vllm using `python -m vllm.entrypoints.cli.main bench`.
+
 !!! tip
     Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
 
@@ -35,7 +38,7 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \
     --model meta-llama/Meta-Llama-3-70B
 ```
 
-benchmark_serving.py:
+vllm bench command:
 
 ```bash
 vllm bench serve \
@@ -69,7 +72,7 @@ apt install nsight-systems-cli
 
 For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference.
 
-The following is an example using the `benchmarks/benchmark_latency.py` script:
+The following is an example using the `vllm bench latency` script:
 
 ```bash
 nsys profile -o report.nsys-rep \
diff --git a/examples/online_serving/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md
index 6df9594516664..7c4e649e6d029 100644
--- a/examples/online_serving/prometheus_grafana/README.md
+++ b/examples/online_serving/prometheus_grafana/README.md
@@ -28,7 +28,7 @@ Submit some sample requests to the server:
 ```bash
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 
-python3 ../../../benchmarks/benchmark_serving.py \
+vllm bench serve \
     --model mistralai/Mistral-7B-v0.1 \
     --tokenizer mistralai/Mistral-7B-v0.1 \
     --endpoint /v1/completions \

From a8936e5193e7889032ebe8678d86af54df8f4491 Mon Sep 17 00:00:00 2001
From: "ZiTian.Zhao" <261131814@qq.com>
Date: Sun, 27 Jul 2025 12:06:21 +0800
Subject: [PATCH 50/57] Refactor: Remove numpy dependency from
 LoggingStatLogger (#20529)

Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
---
 vllm/v1/metrics/loggers.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 7f2556bab5a40..3b0616952babf 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -6,7 +6,6 @@ import time
 from abc import ABC, abstractmethod
 from typing import Callable, Optional, Union
 
-import numpy as np
 import prometheus_client
 
 from vllm.config import SupportsMetricsInfo, VllmConfig
@@ -67,18 +66,20 @@ class LoggingStatLogger(StatLoggerBase):
         self.last_log_time = now
 
         # Tracked stats over current local logging interval.
-        self.num_prompt_tokens: list[int] = []
-        self.num_generation_tokens: list[int] = []
+        self.num_prompt_tokens: int = 0
+        self.num_generation_tokens: int = 0
 
     def _track_iteration_stats(self, iteration_stats: IterationStats):
         # Save tracked stats for token counters.
-        self.num_prompt_tokens.append(iteration_stats.num_prompt_tokens)
-        self.num_generation_tokens.append(
-            iteration_stats.num_generation_tokens)
+        self.num_prompt_tokens += iteration_stats.num_prompt_tokens
+        self.num_generation_tokens += iteration_stats.num_generation_tokens
 
-    def _get_throughput(self, tracked_stats: list[int], now: float) -> float:
+    def _get_throughput(self, tracked_stats: int, now: float) -> float:
         # Compute summary metrics for tracked stats
-        return float(np.sum(tracked_stats) / (now - self.last_log_time))
+        delta_time = now - self.last_log_time
+        if delta_time <= 0.0:
+            return 0.0
+        return float(tracked_stats / delta_time)
 
     def record(self,
                scheduler_stats: Optional[SchedulerStats],

From 1cbf951ba272c230823b947631065b826409fa62 Mon Sep 17 00:00:00 2001
From: Ning Xie <andy.xning@gmail.com>
Date: Sun, 27 Jul 2025 13:14:51 +0800
Subject: [PATCH 51/57] [Misc] add default value for file pattern arg (#21659)

Signed-off-by: Andy Xie <andy.xning@gmail.com>
---
 examples/offline_inference/save_sharded_state.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py
index 9b154e370642b..d6b8b7e6838d7 100644
--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/offline_inference/save_sharded_state.py
@@ -29,6 +29,7 @@ import shutil
 from pathlib import Path
 
 from vllm import LLM, EngineArgs
+from vllm.model_executor.model_loader import ShardedStateLoader
 from vllm.utils import FlexibleArgumentParser
 
 
@@ -39,7 +40,10 @@ def parse_args():
         "--output", "-o", required=True, type=str, help="path to output checkpoint"
     )
     parser.add_argument(
-        "--file-pattern", type=str, help="string pattern of saved filenames"
+        "--file-pattern",
+        type=str,
+        default=ShardedStateLoader.DEFAULT_PATTERN,
+        help="string pattern of saved filenames",
     )
     parser.add_argument(
         "--max-file-size",

From 5f8c9a425ec99fcc2d7e547a7b9a0493490efd52 Mon Sep 17 00:00:00 2001
From: Benji Beck <benjibeck@meta.com>
Date: Sun, 27 Jul 2025 02:43:02 -0700
Subject: [PATCH 52/57] Migrate Florence2ImagePixelInputs to TensorSchema
 (#21663)

Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 vllm/model_executor/models/florence2.py | 52 ++++++++++++-------------
 1 file changed, 24 insertions(+), 28 deletions(-)

diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py
index 1bedac29af9f3..399c739f408ee 100644
--- a/vllm/model_executor/models/florence2.py
+++ b/vllm/model_executor/models/florence2.py
@@ -4,7 +4,7 @@
 import math
 from collections import OrderedDict
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Literal, Optional, TypedDict, Union
+from typing import Annotated, Literal, Optional, Union
 
 import torch
 import torch.nn as nn
@@ -29,16 +29,28 @@ from vllm.multimodal.processing import (BaseProcessingInfo,
                                         PromptUpdate)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
                          SupportsV0Only)
 from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings
 
 
-class Florence2ImagePixelInputs(TypedDict):
+class Florence2ImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - b: Batch size
+        - c: Number of channels (3)
+        - h: Height of the image
+        - w: Width of the image
+    """
+
     type: Literal["pixel_values"]
-    data: torch.Tensor
-    """Shape: (batch_size, num_channel, height, width)"""
+
+    data: Annotated[
+        torch.Tensor,
+        TensorShape("b", 3, "h", "w"),
+    ]
 
 
 # ViT implementation are all copied from
@@ -931,28 +943,6 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
             raise NotImplementedError(
                 'Florence2 only supports COSINE as temporal embedding.')
 
-    def _validate_pixel_values(
-        self, data: Union[torch.Tensor, list[torch.Tensor]]
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
-
-        size = self.processor_config["size"]
-        h, w = size["height"], size["width"]
-        expected_dims = (3, h, w)
-
-        def _validate_shape(d: torch.Tensor):
-            actual_dims = tuple(d.shape)
-
-            if actual_dims != expected_dims:
-                expected_expr = tuple(*map(str, expected_dims))
-                raise ValueError(
-                    "The expected shape of pixel values per batch "
-                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
-
-        for d in data:
-            _validate_shape(d)
-
-        return data
-
     def _parse_and_validate_image_input(self, **kwargs: object):
         pixel_values: Optional[Union[list[list[torch.Tensor]],
                                      list[torch.Tensor],
@@ -971,10 +961,16 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
                 "Both pixel values and image embeds are provided.")
 
         if pixel_values is not None:
+            size = self.processor_config["size"]
+            expected_h, expected_w = size["height"], size["width"]
+
             return Florence2ImagePixelInputs(
                 type="pixel_values",
-                data=self._validate_pixel_values(
-                    flatten_bn(pixel_values, concat=True)),
+                data=flatten_bn(pixel_values, concat=True),
+                resolve_bindings={
+                    "h": expected_h,
+                    "w": expected_w
+                },
             )
 
         if image_embeds is not None:

From 3d847a3125cd640c1f31a1ba28c0b24c0fd201a8 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 27 Jul 2025 19:49:43 +0800
Subject: [PATCH 53/57] [VLM] Add video support for Intern-S1 (#21671)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 docs/models/supported_models.md               |   2 +-
 examples/offline_inference/vision_language.py |   8 +-
 .../multimodal/processing/test_common.py      |   1 +
 vllm/model_executor/models/interns1.py        | 205 ++++++++++++++----
 vllm/model_executor/models/internvl.py        |   1 -
 5 files changed, 170 insertions(+), 47 deletions(-)

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 989bd5fff9655..355ac57094195 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -593,7 +593,7 @@ Specified using `--task generate`.
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
 | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
-| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> | `internlm/Intern-S1`, etc. | | ✅︎ | ✅︎ |
+| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | | ✅︎ | ✅︎ |
 | `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
 | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 8af59110230e1..6f23a29e72f71 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -470,8 +470,6 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
 
 # Intern-S1
 def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
-    assert modality == "image"
-
     model_name = "internlm/Intern-S1"
 
     engine_args = EngineArgs(
@@ -483,7 +481,11 @@ def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
         enforce_eager=True,
     )
 
-    placeholder = "<IMG_CONTEXT>"
+    if modality == "image":
+        placeholder = "<IMG_CONTEXT>"
+    elif modality == "video":
+        placeholder = "<video>"
+
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
     messages = [
         [{"role": "user", "content": f"{placeholder}\n{question}"}]
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 03bdafdbb9311..6405477193a44 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -278,6 +278,7 @@ def _test_processing_correctness_one(
     "THUDM/GLM-4.1V-9B-Thinking",
     "ibm-granite/granite-speech-3.3-2b",
     "h2oai/h2ovl-mississippi-800m",
+    "internlm/Intern-S1",
     "OpenGVLab/InternVL2-1B",
     "OpenGVLab/InternVL3-1B",
     "HuggingFaceM4/Idefics3-8B-Llama3",
diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py
index 36204e4c5953f..ab21cbe91aa1f 100644
--- a/vllm/model_executor/models/interns1.py
+++ b/vllm/model_executor/models/interns1.py
@@ -9,9 +9,10 @@
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Literal, Optional, TypedDict, Union
 
+import regex as re
 import torch
 import torch.nn as nn
-from transformers import InternVLProcessor, PretrainedConfig
+from transformers import BatchFeature, InternVLProcessor, PretrainedConfig
 from transformers.activations import ACT2FN
 from transformers.models.got_ocr2.image_processing_got_ocr2_fast import (
     GotOcr2ImageProcessorFast)
@@ -139,13 +140,13 @@ def get_interns1_target_ratios(
 
 
 class InternS1ProcessingInfo(BaseProcessingInfo):
-    """Basic image-only ProcessingInfo for InternS1-style models."""
+    """ProcessingInfo for InternS1-style models."""
 
     def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
         return self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": None}
+        return {"image": None, "video": None}
 
     def get_num_image_tokens(
         self,
@@ -218,16 +219,35 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
             processor=processor.image_processor,
         )
 
+    def get_num_frames_with_most_features(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> int:
+        max_images = mm_counts.get("image", 0)
+        max_videos = mm_counts.get("video", 0)
+
+        processor = self.get_hf_processor()
+
+        max_image_tokens = self.get_max_image_tokens() * max_images
+        max_total_frames = (seq_len -
+                            max_image_tokens) // processor.image_seq_length
+        max_frames_per_video = max_total_frames // max(max_videos, 1)
+
+        return max(max_frames_per_video, 1)
+
 
 class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo]
                                  ):
-    """Basic image-only DummyInputsBuilder for InternS1-style models."""
+    """DummyInputsBuilder for InternS1-style models."""
 
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
+        num_videos = mm_counts.get("video", 0)
         image_token = self.info.get_hf_processor().image_token
+        video_token = self.info.get_hf_processor().video_token
 
-        return image_token * num_images
+        return image_token * num_images + video_token * num_videos
 
     def get_dummy_mm_data(
         self,
@@ -236,13 +256,24 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo]
     ) -> MultiModalDataDict:
         target_width, target_height = \
             self.info.get_image_size_with_most_features()
+        target_num_frames = \
+                self.info.get_num_frames_with_most_features(seq_len, mm_counts)
         num_images = mm_counts.get("image", 0)
+        num_videos = mm_counts.get("video", 0)
+
+        config = self.info.get_hf_config()
+        image_size_h, image_size_w = config.vision_config.image_size
 
         return {
             "image":
             self._get_dummy_images(width=target_width,
                                    height=target_height,
-                                   num_images=num_images)
+                                   num_images=num_images),
+            "video":
+            self._get_dummy_videos(width=image_size_w,
+                                   height=image_size_h,
+                                   num_frames=target_num_frames,
+                                   num_videos=num_videos),
         }
 
 
@@ -257,33 +288,89 @@ class InternS1MultiModalProcessor(
         mm_kwargs: Mapping[str, object],
         tok_kwargs: Mapping[str, object],
     ) -> Mapping[str, NestedTensors]:
-        processed_outputs = super()._call_hf_processor(
-            prompt=prompt,
-            mm_data=mm_data,
-            mm_kwargs=mm_kwargs,
-            tok_kwargs=tok_kwargs,
-        )
+        mm_data = dict(mm_data)
+        videos = mm_data.pop("videos", [])
+        images = mm_data.pop("images", [])
+        assert isinstance(videos, list)
+        assert isinstance(images, list)
 
         hf_processor = self.info.get_hf_processor(**mm_kwargs)
-        image_token_id = hf_processor.image_token_id
+        tokenizer = hf_processor.tokenizer
+        video_token_id = tokenizer.encode(hf_processor.video_token,
+                                          add_special_tokens=False)
+        assert len(video_token_id) == 1
+        video_token_id = video_token_id[0]
 
-        # Since there may be extra tokens in the feature placeholders,
-        # we need to pass the image token ID to the model to select the
-        # tokens to merge from the vision encoder outputs
-        processed_outputs["image_token_id"] = torch.tensor(image_token_id)
-        images = mm_data.get('images', None)
-        image_processor = self.info.get_hf_processor().image_processor
-        if images is not None:
-            image_inputs = image_processor(images=images)
-            image_num_patches = image_inputs.pop("num_patches")
-            if not isinstance(image_num_patches, list):
-                raise ValueError(
-                    f'num_patches is supposed to be list, but got '
-                    f'{type(image_num_patches)}')
-            image_num_patches = torch.tensor(image_num_patches)
-            processed_outputs['image_num_patches'] = image_num_patches
+        prompt = re.sub(hf_processor.image_token, "<image_placeholder>",
+                        prompt)
+        prompt = re.sub(hf_processor.video_token, "<video_placeholder>",
+                        prompt)
 
-        return processed_outputs
+        image_outputs = {}
+        if images:
+            image_pixel_values = []
+            for image in images:
+                processed_outputs = super()._call_hf_processor(
+                    prompt=hf_processor.image_token,
+                    mm_data={"images": image},
+                    mm_kwargs=mm_kwargs,
+                    tok_kwargs=tok_kwargs,
+                )
+                image_pixel_values.append(
+                    processed_outputs.pop("pixel_values"))
+
+                input_ids = processed_outputs.pop("input_ids")
+                image_placeholder = tokenizer.batch_decode(input_ids)[0]
+                prompt = prompt.replace("<image_placeholder>",
+                                        image_placeholder, 1)
+
+            num_patches = [len(item) for item in image_pixel_values]
+            image_outputs: dict[str, NestedTensors] = {
+                "pixel_values": torch.concat(image_pixel_values),
+                "image_num_patches": torch.tensor(num_patches),
+                "image_token_id": torch.tensor(hf_processor.image_token_id),
+            }
+
+        video_outputs = {}
+        if videos:
+            video_pixel_values = []
+            for video in videos:
+                processed_outputs = super()._call_hf_processor(
+                    prompt=hf_processor.video_token,
+                    mm_data={"videos": video},
+                    mm_kwargs=mm_kwargs,
+                    tok_kwargs=tok_kwargs,
+                )
+                video_pixel_values.append(
+                    processed_outputs.pop("pixel_values"))
+
+                input_ids = processed_outputs.pop("input_ids")
+                input_ids[input_ids ==
+                          hf_processor.image_token_id] = video_token_id
+
+                video_placeholder = tokenizer.batch_decode(input_ids)[0]
+                prompt = prompt.replace("<video_placeholder>",
+                                        video_placeholder, 1)
+
+            num_frames = [len(item) for item in video_pixel_values]
+            video_outputs: dict[str, NestedTensors] = {
+                "pixel_values_videos": torch.concat(video_pixel_values),
+                "video_num_patches": torch.tensor(num_frames),
+                "video_token_id": torch.tensor(video_token_id),
+            }
+
+        prompt = re.sub("<image_placeholder>", hf_processor.image_token,
+                        prompt)
+        prompt = re.sub("<video_placeholder>", hf_processor.video_token,
+                        prompt)
+        text_outputs = tokenizer(prompt, **tok_kwargs, return_tensors="pt")
+
+        combined_outputs = dict(
+            **text_outputs,
+            **image_outputs,
+            **video_outputs,
+        )
+        return BatchFeature(combined_outputs)
 
     def _get_mm_fields_config(
         self,
@@ -292,7 +379,9 @@ class InternS1MultiModalProcessor(
     ) -> Mapping[str, MultiModalFieldConfig]:
 
         image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0))
+        video_num_patches = hf_inputs.get("video_num_patches", torch.empty(0))
         num_images = len(image_num_patches)
+        num_videos = len(video_num_patches)
 
         return dict(
             pixel_values=MultiModalFieldConfig.flat_from_sizes(
@@ -300,6 +389,10 @@ class InternS1MultiModalProcessor(
             image_num_patches=MultiModalFieldConfig.batched("image"),
             image_embeds=MultiModalFieldConfig.batched("image"),
             image_token_id=MultiModalFieldConfig.shared("image", num_images),
+            pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
+                "video", video_num_patches),
+            video_num_patches=MultiModalFieldConfig.batched("video"),
+            video_token_id=MultiModalFieldConfig.shared("video", num_videos),
         )
 
     def _get_prompt_updates(
@@ -312,32 +405,61 @@ class InternS1MultiModalProcessor(
         img_context_token = hf_processor.image_token
         start_image_token = hf_processor.start_image_token
         end_image_token = hf_processor.end_image_token
+        video_token = hf_processor.video_token
 
-        def get_replacement(item_idx: int):
+        if "video_num_patches" in out_mm_kwargs:
+            video_num_patches = out_mm_kwargs["video_num_patches"]
+            assert isinstance(video_num_patches, torch.Tensor)
+            video_num_patches = video_num_patches.tolist()
+        else:
+            video_num_patches = []
+
+        if "image_num_patches" in out_mm_kwargs:
+            image_num_patches = out_mm_kwargs["image_num_patches"]
+            assert isinstance(image_num_patches, torch.Tensor)
+            image_num_patches = image_num_patches.tolist()
+        else:
+            image_num_patches = []
+
+        def get_replacement_interns1_image(item_idx: int):
             images = mm_items.get_items(
                 "image", (ImageEmbeddingItems, ImageProcessorItems))
 
             if isinstance(images, ImageEmbeddingItems):
                 feature_size = images.get_feature_size(item_idx)
             else:
-                image_size = images.get_image_size(item_idx)
-                feature_size = self.info.get_num_image_tokens(
-                    image_width=image_size.width,
-                    image_height=image_size.height,
-                    processor=hf_processor.image_processor,
-                )
+                num_patches = image_num_patches[item_idx]
+                feature_size = num_patches * hf_processor.image_seq_length
 
             repl_features = img_context_token * feature_size
             repl_full = start_image_token + repl_features + end_image_token
             return PromptUpdateDetails.select_text(repl_full,
                                                    img_context_token)
 
+        def get_replacement_interns1_video(item_idx: int):
+            num_patches = video_num_patches[item_idx]
+            repl_features = video_token * hf_processor.image_seq_length
+            repl_features_with_sep = (start_image_token + repl_features +
+                                      end_image_token)
+            # num_patches is equal to num_frames
+            repl_full = '\n'.join([
+                f'Frame{i+1}: {repl_features_with_sep}'
+                for i in range(num_patches)
+            ])
+
+            return PromptUpdateDetails.select_text(repl_full, video_token)
+
         return [
             PromptReplacement(
                 modality="image",
                 target=img_context_token,
-                replacement=get_replacement,
-            )
+                replacement=get_replacement_interns1_image,
+            ),
+            PromptReplacement(
+                modality="video",
+                target=video_token,
+                replacement=get_replacement_interns1_video,
+            ),
         ]
 
 
@@ -514,7 +636,7 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
 
     def _parse_and_validate_video_input(
             self, **kwargs: object) -> Optional[InternS1VideoPixelInputs]:
-        pixel_values_flat_video = kwargs.pop("pixel_values_flat_video", None)
+        pixel_values_flat_video = kwargs.pop("pixel_values_videos", None)
         video_num_patches = kwargs.pop("video_num_patches", None)
         video_embeds = kwargs.pop("video_embeds", None)
 
@@ -595,8 +717,8 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
                              "image_embeds") and "images" not in modalities:
                 modalities["images"] = self._parse_and_validate_image_input(
                     **kwargs)
-            if input_key in ("pixel_values_flat_video",
-                             ) and "videos" not in modalities:
+            if input_key in (
+                    "pixel_values_videos", ) and "videos" not in modalities:
                 modalities["videos"] = self._parse_and_validate_video_input(
                     **kwargs)
 
@@ -614,7 +736,6 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
             return []
-            return None
 
         # The result multimodal_embeddings is tuple of tensors, with each
         # tensor correspoending to a multimodal data item (image or video).
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index f8b9ea2c5b6a0..3637f037751c0 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -1322,7 +1322,6 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
             return []
-            return None
 
         # The result multimodal_embeddings is tuple of tensors, with each
         # tensor correspoending to a multimodal data item (image or video).

From bda9d0535fa7d47169c4b50c6169e59a670aeb92 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Sun, 27 Jul 2025 08:25:21 -0400
Subject: [PATCH 54/57] [Refactor] Refactor MOE NVFP4 Code Base: ModelOpt +
 Compressed Tensor (#21631)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 tests/quantization/test_compressed_tensors.py |  4 +-
 .../compressed_tensors/compressed_tensors.py  |  2 +-
 .../compressed_tensors_moe.py                 | 39 +++--------
 .../layers/quantization/modelopt.py           | 66 ++-----------------
 .../utils/nvfp4_emulation_utils.py            | 15 +----
 .../layers/quantization/utils/quant_utils.py  | 55 ++++++++++++++++
 6 files changed, 75 insertions(+), 106 deletions(-)

diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index db7e50eff72bc..296743dbfa041 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -17,7 +17,9 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
     CompressedTensorsW4A4Fp4, CompressedTensorsW4A16Fp4,
     CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
     CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
-    CompressedTensorsWNA16, cutlass_fp4_supported)
+    CompressedTensorsWNA16)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    cutlass_fp4_supported)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     sparse_cutlass_supported)
 from vllm.platforms import current_platform
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 90b45e32a688d..bc348df84d016 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     find_matched_target, is_activation_quantization_format,
     should_ignore_layer)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
-from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (  # noqa: E501
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
     cutlass_fp4_supported)
 from vllm.platforms import current_platform
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 7da52ce6ff8c8..8f69636dda7bf 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -27,8 +27,8 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
     prepare_moe_fp4_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     prepare_moe_fp8_layer_for_marlin)
-from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (  # noqa: E501
-    cutlass_fp4_supported)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    cutlass_fp4_supported, swizzle_blockscale)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
 from vllm.model_executor.utils import set_weight_attrs
@@ -193,29 +193,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
             {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
         set_weight_attrs(w2_input_scale, extra_weight_attrs)
 
-    def swizzle_blockscale(self, scale: torch.tensor):
-        assert (scale.dtype == torch.float8_e4m3fn)
-        # Pad and blockwise interleave weight_scale
-        scale_ndim = scale.ndim
-        if scale.ndim == 2:
-            scale = scale.unsqueeze(0)
-        assert scale.ndim == 3
-        B, M, K = scale.shape
-        round_up_multiple = lambda x, m: (x + m - 1) // m * m
-        M_padded = round_up_multiple(M, 128)
-        K_padded = round_up_multiple(K, 4)
-        padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
-        padded_scale[:B, :M, :K] = scale
-        batches, rows, cols = padded_scale.shape
-        assert rows % 128 == 0
-        assert cols % 4 == 0
-        padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
-                                            cols // 4, 4)
-        swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
-        swizzled_scale = swizzled_scale.contiguous().cuda()
-        return (swizzled_scale.reshape(M, K)
-                if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
-
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
 
         # From packed to weight
@@ -243,13 +220,13 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
             return
 
         # swizzle weight scales
-        layer.w13_blockscale_swizzled = torch.nn.Parameter(
-            self.swizzle_blockscale(layer.w13_weight_scale),
-            requires_grad=False)
+        layer.w13_blockscale_swizzled = torch.nn.Parameter(swizzle_blockscale(
+            layer.w13_weight_scale),
+                                                           requires_grad=False)
 
-        layer.w2_blockscale_swizzled = torch.nn.Parameter(
-            self.swizzle_blockscale(layer.w2_weight_scale),
-            requires_grad=False)
+        layer.w2_blockscale_swizzled = torch.nn.Parameter(swizzle_blockscale(
+            layer.w2_weight_scale),
+                                                          requires_grad=False)
 
         # w13
         w13_input_global_scale = layer.w13_input_global_scale.max(
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 81611ed07aaa4..38866586ae29e 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -9,8 +9,7 @@ from torch.nn.parameter import Parameter
 
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from vllm._custom_ops import (cutlass_scaled_fp4_mm,
-                              cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
+from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.distributed import get_ep_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
@@ -28,7 +27,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
     apply_fp4_marlin_linear, is_fp4_marlin_supported,
     prepare_fp4_layer_for_marlin, prepare_moe_fp4_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape, is_layer_skipped)
+    GroupShape, cutlass_fp4_supported, is_layer_skipped, swizzle_blockscale)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     Fp8LinearOp, requantize_with_max_scale)
 from vllm.model_executor.parameter import (ModelWeightParameter,
@@ -667,14 +666,6 @@ class ModelOptNvFp4Config(QuantizationConfig):
         return None
 
 
-def cutlass_fp4_supported() -> bool:
-    if not current_platform.is_cuda():
-        return False
-    capability_tuple = current_platform.get_device_capability()
-    capability = -1 if capability_tuple is None else capability_tuple.to_int()
-    return cutlass_scaled_mm_supports_fp4(capability)
-
-
 class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
     """
     Supports loading kv-cache scaling factors from FP8 checkpoints.
@@ -772,29 +763,6 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
 
         layer.register_parameter("weight_scale", weight_scale)
 
-    def swizzle_blockscale(self, scale: torch.tensor):
-        assert (scale.dtype == torch.float8_e4m3fn)
-        # Pad and blockwise interleave weight_scale
-        scale_ndim = scale.ndim
-        if scale.ndim == 2:
-            scale = scale.unsqueeze(0)
-        assert scale.ndim == 3
-        B, M, K = scale.shape
-        round_up_multiple = lambda x, m: (x + m - 1) // m * m
-        M_padded = round_up_multiple(M, 128)
-        K_padded = round_up_multiple(K, 4)
-        padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
-        padded_scale[:B, :M, :K] = scale
-        batches, rows, cols = padded_scale.shape
-        assert rows % 128 == 0
-        assert cols % 4 == 0
-        padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
-                                            cols // 4, 4)
-        swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
-        swizzled_scale = swizzled_scale.contiguous().cuda()
-        return (swizzled_scale.reshape(M, K)
-                if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
-
     def process_weights_after_loading(self, layer: Module) -> None:
 
         # global scales:
@@ -814,7 +782,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
             "Expected weight_scale.dim(1) to be divisible by 16")
         assert (layer.weight_scale.dtype == torch.float8_e4m3fn), (
             "Weight Block scale must be represented as FP8-E4M3")
-        swizzled_weight_scale = self.swizzle_blockscale(layer.weight_scale)
+        swizzled_weight_scale = swizzle_blockscale(layer.weight_scale)
 
         layer.weight_scale_swizzled = Parameter(swizzled_weight_scale,
                                                 requires_grad=False)
@@ -1060,29 +1028,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                                                  weight_loader=weight_loader)
         layer.register_parameter("w2_input_scale", w2_input_scale)
 
-    def swizzle_blockscale(self, scale: torch.tensor):
-        assert (scale.dtype == torch.float8_e4m3fn)
-        # Pad and blockwise interleave weight_scale
-        scale_ndim = scale.ndim
-        if scale.ndim == 2:
-            scale = scale.unsqueeze(0)
-        assert scale.ndim == 3
-        B, M, K = scale.shape
-        round_up_multiple = lambda x, m: (x + m - 1) // m * m
-        M_padded = round_up_multiple(M, 128)
-        K_padded = round_up_multiple(K, 4)
-        padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
-        padded_scale[:B, :M, :K] = scale
-        batches, rows, cols = padded_scale.shape
-        assert rows % 128 == 0
-        assert cols % 4 == 0
-        padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32,
-                                            cols // 4, 4)
-        swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
-        swizzled_scale = swizzled_scale.contiguous().cuda()
-        return (swizzled_scale.reshape(M, K)
-                if scale_ndim == 2 else swizzled_scale.reshape(B, M, K))
-
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # GEMM 1
         # The FlashInfer Cutlass fused MoE kernel expects the combined weights
@@ -1128,8 +1073,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
             "Expected weight_scale.dim(1) to be divisible by 16")
         assert (layer.w13_weight_scale.dtype == torch.float8_e4m3fn), (
             "Weight Blockscale must be represented as FP8-E4M3")
-        w13_blockscale_swizzled = self.swizzle_blockscale(
-            layer.w13_weight_scale)
+        w13_blockscale_swizzled = swizzle_blockscale(layer.w13_weight_scale)
 
         layer.w13_blockscale_swizzled = Parameter(w13_blockscale_swizzled,
                                                   requires_grad=False)
@@ -1151,7 +1095,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
             "Expected weight_scale.dim(1) to be divisible by 16")
         assert (layer.w2_weight_scale.dtype == torch.float8_e4m3fn), (
             "Weight Blockscale must be represented as FP8-E4M3")
-        w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale)
+        w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale)
 
         layer.w2_blockscale_swizzled = Parameter(w2_blockscale_swizzled,
                                                  requires_grad=False)
diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
index fb3287d3b89e6..8648771cb0177 100644
--- a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
@@ -2,13 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 
-from vllm._custom_ops import cutlass_scaled_mm_supports_fp4
-from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 
 __all__ = [
-    "break_fp4_bytes", "dequantize_to_dtype", "ref_nvfp4_quant",
-    "cutlass_fp4_supported"
+    "break_fp4_bytes",
+    "dequantize_to_dtype",
+    "ref_nvfp4_quant",
 ]
 
 FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
@@ -17,14 +16,6 @@ kE2M1ToFloat = torch.tensor([0., 0.5, 1., 1.5, 2., 3., 4., 6.],
                             dtype=torch.float32)
 
 
-def cutlass_fp4_supported() -> bool:
-    if not current_platform.is_cuda():
-        return False
-    capability_tuple = current_platform.get_device_capability()
-    capability = -1 if capability_tuple is None else capability_tuple.to_int()
-    return cutlass_scaled_mm_supports_fp4(capability)
-
-
 def break_fp4_bytes(a, dtype):
     assert a.dtype == torch.uint8
     m, n = a.shape
diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py
index 54361a2323c28..428e9e99aa881 100644
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -8,8 +8,10 @@ from typing import ClassVar, NamedTuple, Optional
 import numpy
 import torch
 
+from vllm._custom_ops import cutlass_scaled_mm_supports_fp4
 from vllm.model_executor.layers.quantization.qqq import (
     MARLIN_QQQ_SUPPORTED_NUM_BITS)
+from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
 
 
@@ -592,3 +594,56 @@ def awq_pack(
     q_w = q_w.reshape((-1, size_n)).contiguous()
 
     return pack_cols(q_w, num_bits, size_k, size_n)
+
+
+def swizzle_blockscale(scale: torch.Tensor) -> torch.Tensor:
+    """
+    Pad and block-interleave the FP4 block-scales so that they match the data
+    layout expected by the CUTLASS / FlashInfer kernels.
+
+    Parameters
+    ----------
+    scale: torch.Tensor
+
+    Returns
+    -------
+    torch.Tensor
+        The swizzled tensor with the same logical shape as *scale*.
+    """
+    assert scale.dtype == torch.float8_e4m3fn, (
+        "swizzle_blockscale expects the input tensor to be in "
+        "torch.float8_e4m3fn format.")
+
+    scale_ndim = scale.ndim
+    if scale_ndim == 2:
+        scale = scale.unsqueeze(0)  # (1, M, K)
+    assert scale.ndim == 3, "Expected a 2-D or 3-D tensor for block scales."
+
+    B, M, K = scale.shape
+
+    def _round_up(x: int, m: int) -> int:
+        return (x + m - 1) // m * m
+
+    M_padded = _round_up(M, 128)
+    K_padded = _round_up(K, 4)
+
+    padded = torch.zeros((B, M_padded, K_padded),
+                         dtype=scale.dtype,
+                         device=scale.device)
+    padded[:B, :M, :K] = scale
+
+    # Reshape / permute to the layout required by the kernel.
+    padded = padded.reshape(B, M_padded // 128, 4, 32, K_padded // 4, 4)
+    swizzled = padded.permute(0, 1, 4, 3, 2, 5).contiguous().cuda()
+
+    if scale_ndim == 2:
+        return swizzled.reshape(M, K)
+    return swizzled.reshape(B, M, K)
+
+
+def cutlass_fp4_supported() -> bool:
+    if not current_platform.is_cuda():
+        return False
+    capability_tuple = current_platform.get_device_capability()
+    capability = -1 if capability_tuple is None else capability_tuple.to_int()
+    return cutlass_scaled_mm_supports_fp4(capability)

From 57c22e57f989b466a46a990243bb7f072a668b7f Mon Sep 17 00:00:00 2001
From: Caleb_Du <59528230+CalebDu@users.noreply.github.com>
Date: Sun, 27 Jul 2025 22:08:00 +0800
Subject: [PATCH 55/57] Fix CUDA permute/unpermute for use with DeepGemm Moe
 (#17934)

Signed-off-by: Caleb_Du <Caleb_Du@zju.edu.cn>
---
 .../benchmark_moe_permute_unpermute.py        |  76 +++++-----
 csrc/moe/moe_permute_unpermute_op.cu          |  73 +++++-----
 .../moe_permute_unpermute_kernel.cu           |   2 +-
 .../moe_permute_unpermute_kernel.h            |  20 +--
 .../moe_permute_unpermute_kernel.inl          |  39 +++--
 csrc/moe/torch_bindings.cpp                   |  11 +-
 .../kernels/moe/test_moe_permute_unpermute.py | 136 +++++++++++-------
 .../layers/fused_moe/moe_permute_unpermute.py |  92 ++++++------
 8 files changed, 238 insertions(+), 211 deletions(-)

diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
index 4ed6900901442..04d2205aa3722 100644
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -8,12 +8,13 @@ import ray
 import torch
 from transformers import AutoConfig
 
-from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
+from vllm.model_executor.layers.fused_moe.fused_moe import *
+from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
     _moe_permute,
     _moe_unpermute_and_reduce,
+    moe_permute,
+    moe_unpermute,
 )
-from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import *
 from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
 from vllm.platforms import current_platform
 from vllm.utils import FlexibleArgumentParser
@@ -63,18 +64,19 @@ def benchmark_permute(
 
     def run():
         if use_customized_permute:
-            (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
-                moe_permute(
-                    qhidden_states,
-                    topk_weights=topk_weights,
-                    topk_ids=topk_ids,
-                    token_expert_indices=token_expert_indices,
-                    topk=topk,
-                    n_expert=num_experts,
-                    n_local_expert=num_experts,
-                    expert_map=None,
-                    align_block_size=align_block_size,
-                )
+            (
+                permuted_hidden_states,
+                a1q_scale,
+                first_token_off,
+                inv_perm_idx,
+                m_indices,
+            ) = moe_permute(
+                qhidden_states,
+                a1q_scale=None,
+                topk_ids=topk_ids,
+                n_expert=num_experts,
+                expert_map=None,
+                align_block_size=align_block_size,
             )
         else:
             (
@@ -150,18 +152,19 @@ def benchmark_unpermute(
 
     def prepare():
         if use_customized_permute:
-            (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
-                moe_permute(
-                    qhidden_states,
-                    topk_weights=topk_weights,
-                    topk_ids=topk_ids,
-                    token_expert_indices=token_expert_indices,
-                    topk=topk,
-                    n_expert=num_experts,
-                    n_local_expert=num_experts,
-                    expert_map=None,
-                    align_block_size=align_block_size,
-                )
+            (
+                permuted_hidden_states,
+                a1q_scale,
+                first_token_off,
+                inv_perm_idx,
+                m_indices,
+            ) = moe_permute(
+                qhidden_states,
+                a1q_scale=None,
+                topk_ids=topk_ids,
+                n_expert=num_experts,
+                expert_map=None,
+                align_block_size=align_block_size,
             )
             # convert to fp16/bf16 as gemm output
             return (
@@ -191,16 +194,19 @@ def benchmark_unpermute(
 
     def run(input: tuple):
         if use_customized_permute:
-            (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input
+            (
+                permuted_hidden_states,
+                first_token_off,
+                inv_perm_idx,
+                m_indices,
+            ) = input
+            output = torch.empty_like(hidden_states)
             moe_unpermute(
+                output,
                 permuted_hidden_states,
                 topk_weights,
-                topk_ids,
                 inv_perm_idx,
                 first_token_off,
-                topk,
-                num_experts,
-                num_experts,
             )
         else:
             (
@@ -211,7 +217,11 @@ def benchmark_unpermute(
                 inv_perm,
             ) = input
             _moe_unpermute_and_reduce(
-                output_hidden_states, permuted_hidden_states, inv_perm, topk_weights
+                output_hidden_states,
+                permuted_hidden_states,
+                inv_perm,
+                topk_weights,
+                True,
             )
 
     # JIT compilation & warmup
diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu
index a77471a7f2078..2922352a3f7cc 100644
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@@ -10,32 +10,28 @@
 
 void moe_permute(
     const torch::Tensor& input,                      // [n_token, hidden]
-    const torch::Tensor& topk_weights,               //[n_token, topk]
-    torch::Tensor& topk_ids,                         // [n_token, topk]
+    const torch::Tensor& topk_ids,                   // [n_token, topk]
     const torch::Tensor& token_expert_indices,       // [n_token, topk]
     const std::optional<torch::Tensor>& expert_map,  // [n_expert]
     int64_t n_expert, int64_t n_local_expert, int64_t topk,
     const std::optional<int64_t>& align_block_size,
-    torch::Tensor&
-        permuted_input,  // [topk * n_token/align_block_size_m, hidden]
+    torch::Tensor& permuted_input,             // [permuted_size, hidden]
     torch::Tensor& expert_first_token_offset,  // [n_local_expert + 1]
-    torch::Tensor& src_row_id2dst_row_id_map,  // [n_token, topk]
+    torch::Tensor& inv_permuted_idx,           // [n_token, topk]
+    torch::Tensor& permuted_idx,               // [permute_size]
     torch::Tensor& m_indices) {                // [align_expand_m]
-  TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float,
-              "topk_weights must be float32");
   TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long,
               "expert_first_token_offset must be int64");
   TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
               "topk_ids must be int32");
   TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
               "token_expert_indices must be int32");
-  TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
-              "src_row_id2dst_row_id_map must be int32");
+  TORCH_CHECK(inv_permuted_idx.scalar_type() == at::ScalarType::Int,
+              "inv_permuted_idx must be int32");
   TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
               "expert_first_token_offset shape != n_local_expert+1")
-  TORCH_CHECK(
-      src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
-      "token_expert_indices shape must be same as src_row_id2dst_row_id_map");
+  TORCH_CHECK(inv_permuted_idx.sizes() == token_expert_indices.sizes(),
+              "token_expert_indices shape must be same as inv_permuted_idx");
   auto n_token = input.sizes()[0];
   auto n_hidden = input.sizes()[1];
   auto align_block_size_value =
@@ -46,8 +42,9 @@ void moe_permute(
   auto sort_workspace = torch::empty(
       {sorter_size},
       torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
+  auto copy_topk_ids = topk_ids.clone();  // copy topk_ids for preprocess
   auto permuted_experts_id = torch::empty_like(topk_ids);
-  auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map);
+  auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
   auto align_expert_first_token_offset =
       torch::zeros_like(expert_first_token_offset);
 
@@ -67,24 +64,22 @@ void moe_permute(
     const int* expert_map_ptr = get_ptr<int>(expert_map.value());
     valid_num_ptr =
         get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
-    preprocessTopkIdLauncher(get_ptr<int>(topk_ids), n_token * topk,
+    preprocessTopkIdLauncher(get_ptr<int>(copy_topk_ids), n_token * topk,
                              expert_map_ptr, n_expert, stream);
   }
   // expert sort topk expert id and scan expert id get expert_first_token_offset
-  sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
-                    get_ptr<int>(permuted_experts_id),
-                    get_ptr<int>(dst_row_id2src_row_id_map),
-                    get_ptr<int64_t>(expert_first_token_offset), n_token,
-                    n_expert, n_local_expert, topk, sorter,
-                    get_ptr<int>(sort_workspace), stream);
+  sortAndScanExpert(
+      get_ptr<int>(copy_topk_ids), get_ptr<int>(token_expert_indices),
+      get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
+      get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
+      n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);
 
   // dispatch expandInputRowsKernelLauncher
   MOE_DISPATCH(input.scalar_type(), [&] {
     expandInputRowsKernelLauncher<scalar_t>(
         get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
-        get_ptr<float>(topk_weights), get_ptr<int>(permuted_experts_id),
-        get_ptr<int>(dst_row_id2src_row_id_map),
-        get_ptr<int>(src_row_id2dst_row_id_map),
+        get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
+        get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
         get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
         n_hidden, topk, n_local_expert, align_block_size_value, stream);
   });
@@ -101,32 +96,34 @@ void moe_permute(
 }
 
 void moe_unpermute(
-    const torch::Tensor& permuted_hidden_states,     // [n_token * topk, hidden]
-    const torch::Tensor& topk_weights,               //[n_token, topk]
-    const torch::Tensor& topk_ids,                   // [n_token, topk]
-    const torch::Tensor& src_row_id2dst_row_id_map,  // [n_token, topk]
-    const torch::Tensor& expert_first_token_offset,  // [n_local_expert+1]
-    int64_t n_expert, int64_t n_local_expert, int64_t topk,
+    const torch::Tensor& permuted_hidden_states,  // [n_token * topk, hidden]
+    const torch::Tensor& topk_weights,            // [n_token, topk]
+    const torch::Tensor& inv_permuted_idx,        // [n_token, topk]
+    const std::optional<torch::Tensor>&
+        expert_first_token_offset,  // [n_local_expert+1]
+    int64_t topk,
     torch::Tensor& hidden_states  // [n_token, hidden]
 ) {
-  TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(),
-              "topk_ids shape must be same as src_row_id2dst_row_id_map");
-  TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
-              "topk_ids must be int32");
   TORCH_CHECK(
       permuted_hidden_states.scalar_type() == hidden_states.scalar_type(),
-      "topk_ids dtype must be same as src_row_id2dst_row_id_map");
+      "permuted_hidden_states dtype must be same as hidden_states");
   auto n_token = hidden_states.size(0);
   auto n_hidden = hidden_states.size(1);
   auto stream = at::cuda::getCurrentCUDAStream().stream();
-  const int64_t* valid_ptr =
-      get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
+
+  int64_t const* valid_ptr = nullptr;
+  if (expert_first_token_offset.has_value()) {
+    int n_local_expert = expert_first_token_offset.value().size(0) - 1;
+    valid_ptr =
+        get_ptr<int64_t>(expert_first_token_offset.value()) + n_local_expert;
+  }
+
   MOE_DISPATCH(hidden_states.scalar_type(), [&] {
     finalizeMoeRoutingKernelLauncher<scalar_t, scalar_t>(
         get_ptr<scalar_t>(permuted_hidden_states),
         get_ptr<scalar_t>(hidden_states), get_ptr<float>(topk_weights),
-        get_ptr<int>(src_row_id2dst_row_id_map), get_ptr<int>(topk_ids),
-        n_token, n_hidden, topk, valid_ptr, stream);
+        get_ptr<int>(inv_permuted_idx), n_token, n_hidden, topk, valid_ptr,
+        stream);
   });
 }
 
diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
index de2c153882d93..2271c1bc75b1f 100644
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
@@ -177,7 +177,7 @@ __global__ void getMIndicesKernel(int64_t* expert_first_token_offset,
   int tidx = threadIdx.x;
   extern __shared__ int64_t smem_expert_first_token_offset[];
   for (int i = tidx; i <= num_local_expert; i += blockDim.x) {
-    smem_expert_first_token_offset[tidx] = __ldg(expert_first_token_offset + i);
+    smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i);
   }
   __syncthreads();
   auto last_token_offset = smem_expert_first_token_offset[eidx + 1];
diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
index 43c29721cd16e..108091efbefa8 100644
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
@@ -57,31 +57,19 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
 
 template <typename T>
 void expandInputRowsKernelLauncher(
-    T const* unpermuted_input, T* permuted_output,
-    const float* unpermuted_scales, int* sorted_experts,
+    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
     int const* expanded_dest_row_to_expanded_source_row,
-    int* expanded_source_row_to_expanded_dest_row,
+    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
     int64_t* expert_first_token_offset, int64_t const num_rows,
     int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
     int num_local_experts, const int& align_block_size, cudaStream_t stream);
 
-// Final kernel to unpermute and scale
-// This kernel unpermutes the original data, does the k-way reduction and
-// performs the final skip connection.
-template <typename T, typename OutputType, bool CHECK_SKIPPED>
-__global__ void finalizeMoeRoutingKernel(
-    T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
-    float const* scales, int const* expanded_source_row_to_expanded_dest_row,
-    int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
-    int64_t const* num_valid_ptr);
-
 template <class T, class OutputType>
 void finalizeMoeRoutingKernelLauncher(
     T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
     float const* scales, int const* expanded_source_row_to_expanded_dest_row,
-    int const* expert_for_source_row, int64_t const num_rows,
-    int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
-    cudaStream_t stream);
+    int64_t const num_rows, int64_t const cols, int64_t const k,
+    int64_t const* num_valid_ptr, cudaStream_t stream);
 
 void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
                               const int* expert_map_ptr, int num_experts,
diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
index ad0d390665a00..449243b92a283 100644
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
@@ -2,10 +2,9 @@
 
 template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
 __global__ void expandInputRowsKernel(
-    T const* unpermuted_input, T* permuted_output,
-    const float* unpermuted_scales, int* sorted_experts,
+    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
     int const* expanded_dest_row_to_expanded_source_row,
-    int* expanded_source_row_to_expanded_dest_row,
+    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
     int64_t* expert_first_token_offset, int64_t const num_rows,
     int64_t const* num_dest_rows, int64_t const cols, int64_t k,
     int num_local_experts, int align_block_size) {
@@ -54,6 +53,10 @@ __global__ void expandInputRowsKernel(
     assert(expanded_dest_row <= INT32_MAX);
     expanded_source_row_to_expanded_dest_row[expanded_source_row] =
         static_cast<int>(expanded_dest_row);
+    // skip non local expert token
+    if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
+      permuted_idx[expanded_dest_row] = expanded_source_row;
+    }
   }
 
   if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
@@ -62,7 +65,7 @@ __global__ void expandInputRowsKernel(
     using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
 
     // Duplicate and permute rows
-    int64_t const source_row = expanded_source_row % num_rows;
+    int64_t const source_row = expanded_source_row / k;
 
     auto const* source_row_ptr =
         reinterpret_cast<DataElem const*>(unpermuted_input + source_row * cols);
@@ -82,10 +85,9 @@ __global__ void expandInputRowsKernel(
 
 template <typename T>
 void expandInputRowsKernelLauncher(
-    T const* unpermuted_input, T* permuted_output,
-    const float* unpermuted_scales, int* sorted_experts,
+    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
     int const* expanded_dest_row_to_expanded_source_row,
-    int* expanded_source_row_to_expanded_dest_row,
+    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
     int64_t* expert_first_token_offset, int64_t const num_rows,
     int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
     int num_local_experts, const int& align_block_size, cudaStream_t stream) {
@@ -105,11 +107,11 @@ void expandInputRowsKernelLauncher(
   int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);
 
   func<<<blocks, threads, smem_size, stream>>>(
-      unpermuted_input, permuted_output, unpermuted_scales, sorted_experts,
+      unpermuted_input, permuted_output, sorted_experts,
       expanded_dest_row_to_expanded_source_row,
-      expanded_source_row_to_expanded_dest_row, expert_first_token_offset,
-      num_rows, num_valid_tokens_ptr, cols, k, num_local_experts,
-      align_block_size);
+      expanded_source_row_to_expanded_dest_row, permuted_idx,
+      expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
+      num_local_experts, align_block_size);
 }
 
 template <class T, class U>
@@ -128,11 +130,9 @@ template <typename T, typename OutputType, bool CHECK_SKIPPED>
 __global__ void finalizeMoeRoutingKernel(
     T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
     float const* scales, int const* expanded_source_row_to_expanded_dest_row,
-    int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
-    int64_t const* num_valid_ptr) {
+    int64_t const orig_cols, int64_t const k, int64_t const* num_valid_ptr) {
   assert(orig_cols % 4 == 0);
   int64_t const original_row = blockIdx.x;
-  int64_t const num_rows = gridDim.x;
   auto const offset = original_row * orig_cols;
   OutputType* reduced_row_ptr = reduced_unpermuted_output + offset;
   int64_t const num_valid = *num_valid_ptr;
@@ -159,14 +159,13 @@ __global__ void finalizeMoeRoutingKernel(
     ComputeElem thread_output;
     thread_output.fill(0);
     for (int k_idx = 0; k_idx < k; ++k_idx) {
-      int64_t const expanded_original_row = original_row + k_idx * num_rows;
+      int64_t const expanded_original_row = original_row * k + k_idx;
       int64_t const expanded_permuted_row =
           expanded_source_row_to_expanded_dest_row[expanded_original_row];
 
       int64_t const k_offset = original_row * k + k_idx;
       float const row_scale = scales[k_offset];
 
-      // Check after row_rescale has accumulated
       if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) {
         continue;
       }
@@ -189,9 +188,8 @@ template <class T, class OutputType>
 void finalizeMoeRoutingKernelLauncher(
     T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
     float const* scales, int const* expanded_source_row_to_expanded_dest_row,
-    int const* expert_for_source_row, int64_t const num_rows,
-    int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
-    cudaStream_t stream) {
+    int64_t const num_rows, int64_t const cols, int64_t const k,
+    int64_t const* num_valid_ptr, cudaStream_t stream) {
   int64_t const blocks = num_rows;
   int64_t const threads = 256;
   bool const check_finished = num_valid_ptr != nullptr;
@@ -201,6 +199,5 @@ void finalizeMoeRoutingKernelLauncher(
   auto* const kernel = func_map[check_finished];
   kernel<<<blocks, threads, 0, stream>>>(
       expanded_permuted_rows, reduced_unpermuted_output, scales,
-      expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k,
-      num_valid_ptr);
+      expanded_source_row_to_expanded_dest_row, cols, k, num_valid_ptr);
 }
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index 97df311d04409..d96e082f6ef11 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -56,18 +56,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       " -> Tensor");
 
   m.def(
-      "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
+      "moe_permute(Tensor input, Tensor topk_ids,"
       "Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
       "int n_local_expert,"
       "int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
-      "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
-      "m_indices)->()");
+      "expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! "
+      "permuted_idx, Tensor! m_indices)->()");
 
   m.def(
       "moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
-      "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor "
-      "expert_first_token_offset, int n_expert, int n_local_expert,int "
-      "topk, Tensor! hidden_states)->()");
+      "Tensor inv_permuted_idx, Tensor? expert_first_token_offset, "
+      "int topk, Tensor! hidden_states)->()");
 
   m.def("moe_permute_unpermute_supported() -> bool");
   m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py
index 7cc83b512c8b9..8d215a0cbeed8 100644
--- a/tests/kernels/moe/test_moe_permute_unpermute.py
+++ b/tests/kernels/moe/test_moe_permute_unpermute.py
@@ -17,28 +17,34 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
     moe_permute, moe_permute_unpermute_supported, moe_unpermute)
 from vllm.platforms import current_platform
 
-NUM_EXPERTS = [16, 64]
+NUM_EXPERTS = [16, 64, 256]
 TOP_KS = [2, 4, 6, 8]
 EP_SIZE = [1, 4, 16]
 current_platform.seed_everything(0)
 
 
-def torch_permute(hidden_states: torch.Tensor,
-                  topk_ids: torch.Tensor,
-                  token_expert_indices: torch.Tensor,
-                  topk: int,
-                  n_expert: int,
-                  n_local_expert: int,
-                  start_expert: int,
-                  expert_map: Optional[torch.Tensor] = None,
-                  align_block_size: Optional[int] = None,
-                  fill_invalid_expert: int = -1) -> list[torch.Tensor]:
+def torch_permute(
+        hidden_states: torch.Tensor,
+        topk_ids: torch.Tensor,
+        #   token_expert_indices: torch.Tensor,
+        topk: int,
+        n_expert: int,
+        n_local_expert: int,
+        start_expert: int,
+        expert_map: Optional[torch.Tensor] = None,
+        align_block_size: Optional[int] = None,
+        fill_invalid_expert: int = -1) -> list[torch.Tensor]:
     n_token, n_hidden = hidden_states.shape[0], hidden_states.shape[1]
     if expert_map is not None:
         is_local_expert = (expert_map[topk_ids] != -1)
         not_local_expert = (expert_map[topk_ids] == -1)
         topk_ids = is_local_expert * (
             topk_ids - start_expert) + not_local_expert * (topk_ids + n_expert)
+    token_expert_indices = torch.arange(0,
+                                        n_token * topk,
+                                        dtype=torch.int32,
+                                        device=hidden_states.device).reshape(
+                                            (n_token, topk))
 
     sorted_topk_ids, sorted_indices = torch.sort(topk_ids.flatten(),
                                                  stable=True)
@@ -59,8 +65,8 @@ def torch_permute(hidden_states: torch.Tensor,
     valid_row_idx = []
     if align_block_size is None:
 
-        permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map %
-                                               n_token, ...]
+        permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map //
+                                               topk, ...]
         permuted_row_size = permuted_hidden_states.shape[0]
         m_indices = torch.empty(permuted_row_size,
                                 device="cuda",
@@ -73,14 +79,21 @@ def torch_permute(hidden_states: torch.Tensor,
             0, n_token * topk, device="cuda",
             dtype=torch.int32)[src2dst_idx].reshape((n_token, topk))
         valid_row_idx += [i for i in range(expert_first_token_offset[-1])]
+        dst_row_id2src_row_id_map[
+            expert_first_token_offset[-1]:] = n_token * topk
         return [
             permuted_hidden_states, expert_first_token_offset,
-            src_row_id2dst_row_id_map, m_indices, valid_row_idx
+            src_row_id2dst_row_id_map, dst_row_id2src_row_id_map, m_indices,
+            valid_row_idx
         ]
     else:
         permuted_row_size = (topk * n_token + n_expert *
                              (align_block_size - 1) + align_block_size -
                              1) // align_block_size * align_block_size
+        permuted_idx = torch.full((permuted_row_size, ),
+                                  n_token * topk,
+                                  dtype=torch.int32,
+                                  device=hidden_states.device)
         permuted_hidden_states = torch.empty((permuted_row_size, n_hidden),
                                              device="cuda",
                                              dtype=hidden_states.dtype)
@@ -105,13 +118,16 @@ def torch_permute(hidden_states: torch.Tensor,
             align_first_token_offset = align_expert_first_token_offset[i - 1]
             align_last_token_offset = align_expert_first_token_offset[i]
             dst_row_id2src_row_id_in_expert = dst_row_id2src_row_id_map[
-                first_token_offset:first_token_offset +
-                n_token_in_expert] % n_token
+                first_token_offset:first_token_offset + n_token_in_expert]
             # store token in current expert with align_first_token_offset
             permuted_hidden_states[align_first_token_offset:\
                                    align_first_token_offset+n_token_in_expert,\
                                       ...] = hidden_states[\
-                                       dst_row_id2src_row_id_in_expert, ...]
+                                       dst_row_id2src_row_id_in_expert // topk,\
+                                          ...]
+            permuted_idx[align_first_token_offset:\
+                         align_first_token_offset+\
+                         n_token_in_expert] = dst_row_id2src_row_id_in_expert
             # set current expert m_indices
             m_indices[align_first_token_offset:align_last_token_offset] = i - 1
             valid_row_idx += [
@@ -135,7 +151,7 @@ def torch_permute(hidden_states: torch.Tensor,
             src2dst_idx].reshape((n_token, topk))
         return [
             permuted_hidden_states, align_expert_first_token_offset,
-            align_src_row_id2dst_row_id, m_indices, valid_row_idx
+            align_src_row_id2dst_row_id, permuted_idx, m_indices, valid_row_idx
         ]
 
 
@@ -146,15 +162,18 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor,
                     valid_row_idx: torch.Tensor, topk: int,
                     n_expert: int) -> torch.Tensor:
     # ignore invalid row
+    n_hidden = permuted_hidden_states.shape[1]
     mask = torch.zeros(permuted_hidden_states.shape[0],
                        dtype=bool,
                        device="cuda")
     mask[valid_row_idx] = True
     permuted_hidden_states[~mask] = 0
-    idx = src_row_id2dst_row_id_map.flatten()[
-        token_expert_indices.flatten()].reshape(token_expert_indices.shape)
-    output = permuted_hidden_states[idx, ...] * topk_weights[..., None]
-    output = output.sum(dim=1).to(permuted_hidden_states.dtype)
+
+    permuted_hidden_states = permuted_hidden_states[
+        src_row_id2dst_row_id_map.flatten(), ...]
+    permuted_hidden_states = permuted_hidden_states.view(-1, topk, n_hidden)
+    output = (permuted_hidden_states * topk_weights.unsqueeze(2)).sum(1).to(
+        permuted_hidden_states.dtype)
     return output
 
 
@@ -184,43 +203,56 @@ def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int,
     gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype)
     topk_weights, topk_ids, token_expert_indices = fused_topk(
         hidden_states, gating_output, topk, False)
-    gold0, gold1, gold2, gold3, valid_row_idx = torch_permute(
-        hidden_states,
-        topk_ids,
-        token_expert_indices,
-        topk,
-        n_expert,
-        n_local_expert,
-        start_expert,
-        expert_map=expert_map,
-        align_block_size=align_block_size,
-        fill_invalid_expert=fill_invalid_expert)
+    (gold_permuted_hidden_states, gold_expert_first_token_offset,
+     gold_inv_permuted_idx, gold_permuted_idx, gold_m_indices,
+     valid_row_idx) = torch_permute(
+         hidden_states,
+         topk_ids,
+         # token_expert_indices,
+         topk,
+         n_expert,
+         n_local_expert,
+         start_expert,
+         expert_map=expert_map,
+         align_block_size=align_block_size,
+         fill_invalid_expert=fill_invalid_expert)
 
-    result0, result1, result2, result3 = moe_permute(
-        hidden_states, topk_weights, topk_ids, token_expert_indices, topk,
-        n_expert, n_local_expert, expert_map, align_block_size,
-        fill_invalid_expert)
+    (permuted_hidden_states, _, expert_first_token_offset, inv_permuted_idx,
+     m_indices) = moe_permute(hidden_states=hidden_states,
+                              a1q_scale=None,
+                              topk_ids=topk_ids,
+                              n_expert=n_expert,
+                              n_local_expert=n_local_expert,
+                              expert_map=expert_map,
+                              align_block_size=align_block_size,
+                              fill_invalid_expert=fill_invalid_expert)
 
     # check expert_first_token_offset
-    torch.testing.assert_close(gold1, result1, atol=0, rtol=0)
-    # check src_row_id2dst_row_id_map
-    torch.testing.assert_close(gold2, result2, atol=0, rtol=0)
-    # check mindice
-    torch.testing.assert_close(gold3, result3, atol=0, rtol=0)
-    # check permuted_hidden_states, only valid token
-    torch.testing.assert_close(gold0[valid_row_idx],
-                               result0[valid_row_idx],
+    torch.testing.assert_close(gold_expert_first_token_offset,
+                               expert_first_token_offset,
+                               atol=0,
+                               rtol=0)
+    # check src_row_id2dst_row_id_map
+    torch.testing.assert_close(gold_inv_permuted_idx.flatten(),
+                               inv_permuted_idx,
+                               atol=0,
+                               rtol=0)
+    # check mindice
+    torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0)
+    # check permuted_hidden_states, only valid token
+    torch.testing.assert_close(gold_permuted_hidden_states[valid_row_idx],
+                               permuted_hidden_states[valid_row_idx],
                                atol=0,
                                rtol=0)
-
     # add a random tensor to simulate group gemm
-    result0 = 0.5 * result0 + torch.randn_like(result0)
+    result0 = 0.5 * permuted_hidden_states + torch.randn_like(
+        permuted_hidden_states)
+    result4 = torch.empty_like(hidden_states)
+    moe_unpermute(result4, result0, topk_weights, inv_permuted_idx,
+                  expert_first_token_offset)
 
-    result4 = moe_unpermute(result0, topk_weights, topk_ids, result2, result1,
-                            topk, n_expert, n_local_expert)
     gold4 = torch_unpermute(result0, topk_weights, topk_ids,
-                            token_expert_indices, result2, valid_row_idx, topk,
-                            n_local_expert)
-
+                            token_expert_indices, inv_permuted_idx,
+                            valid_row_idx, topk, n_local_expert)
     # check unpermuted hidden
     torch.testing.assert_close(result4, gold4, atol=2e-2, rtol=0)
diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
index 20ee0d9f780a7..d9059f50b4459 100644
--- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
+++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
@@ -76,43 +76,43 @@ def _moe_unpermute_and_reduce(
 
 def moe_permute(
     hidden_states: torch.Tensor,
-    topk_weights: torch.Tensor,
+    a1q_scale: Optional[torch.Tensor],
     topk_ids: torch.Tensor,
-    token_expert_indices: torch.Tensor,
-    topk: int,
     n_expert: int,
-    n_local_expert: int,
+    n_local_expert: int = -1,
     expert_map: Optional[torch.Tensor] = None,
     align_block_size: Optional[int] = None,
     fill_invalid_expert: int = -1
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
+           torch.Tensor]:
     """
     This function expands and permutes activation to gather uncontinuous tokens
       for each expert.
     Parameters:
     - hidden_states (torch.Tensor): The input tensor to the MoE layer.
-    - topk_weights (torch.Tensor): topk expert route weight for each token.
+    - a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states
     - topk_ids (torch.Tensor): topk expert route id for each token.
-    - token_expert_indices (torch.Tensor): indice for expanded hidden.
-    - topk (int): The number of top-k experts to select.
     - n_expert (int): The number of expert.
     - n_local_expert (int): The number of expert in current EP rank.
     - expert_map (Optional[torch.Tensor]):  A tensor mapping expert indices
-        from the global expert space to the local expert space of the expert
+        from the global expert space to the local expert space of the expert 
         parallel shard.
     - align_block_size (Optional[int]): align group gemm block size for deepgemm
     - fill_invalid_expert(int): fill expert id in m_indices for invalid expert
       to workaround DeepGemm unsupported -1 in m_indices
     Returns:
     - permuted_hidden_states (torch.Tensor): permuted activation.
+    - a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states
     - expert_first_token_offset (torch.Tensor): offset of the first token
        of each expert for standard grouped gemm. if enable 'align_block_size'
        expert_first_token_offset will align up to 'align_block_size'.
-    - src_row_id2dst_row_id_map (torch.Tensor): idx map for moe_unpermute.
+    - inv_permuted_idx (torch.Tensor): idx map for moe_unpermute.
+    - permuted_idx (torch.Tensor): idx map from hidden to permuted_hidden.
     - m_indices: m_indices for grouped gemm in deepgemm,`m_indices[i]` records
     the group which the j-th row of the LHS belong to.`
     """
     n_token, n_hidden = hidden_states.size()
+    topk = topk_ids.size(1)
     assert (n_hidden * hidden_states.element_size()
             ) % 16 == 0, "permue kernel need hidden dim align to 16B"
     permuted_row_size = n_token * topk
@@ -120,12 +120,19 @@ def moe_permute(
         permuted_row_size = (permuted_row_size + n_expert *
                              (align_block_size - 1) + align_block_size -
                              1) // align_block_size * align_block_size
-
+    if n_local_expert == -1:
+        n_local_expert = n_expert
     permuted_hidden_states = torch.empty(
         (permuted_row_size, n_hidden),
         dtype=hidden_states.dtype,
         device=hidden_states.device,
     )
+    token_expert_indices = torch.arange(0,
+                                        n_token * topk,
+                                        dtype=torch.int32,
+                                        device=hidden_states.device).reshape(
+                                            (n_token, topk))
+
     m_indices = torch.full((permuted_row_size, ),
                            fill_invalid_expert,
                            dtype=torch.int32,
@@ -133,57 +140,54 @@ def moe_permute(
     expert_first_token_offset = torch.empty(n_local_expert + 1,
                                             dtype=torch.int64,
                                             device=hidden_states.device)
-    src_row_id2dst_row_id_map = torch.empty((n_token, topk),
-                                            dtype=torch.int32,
-                                            device=hidden_states.device)
-    torch.ops._moe_C.moe_permute(hidden_states, topk_weights, topk_ids,
-                                 token_expert_indices, expert_map, n_expert,
-                                 n_local_expert, topk, align_block_size,
-                                 permuted_hidden_states,
-                                 expert_first_token_offset,
-                                 src_row_id2dst_row_id_map, m_indices)
-    return (permuted_hidden_states, expert_first_token_offset,
-            src_row_id2dst_row_id_map, m_indices)
+    permuted_idx = torch.full((permuted_row_size, ),
+                              n_token * topk,
+                              dtype=torch.int32,
+                              device=hidden_states.device)
+    inv_permuted_idx = torch.empty((n_token, topk),
+                                   dtype=torch.int32,
+                                   device=hidden_states.device)
+    topk_ids = topk_ids.to(torch.int32)
+    torch.ops._moe_C.moe_permute(hidden_states, topk_ids, token_expert_indices,
+                                 expert_map, n_expert, n_local_expert, topk,
+                                 align_block_size, permuted_hidden_states,
+                                 expert_first_token_offset, inv_permuted_idx,
+                                 permuted_idx, m_indices)
+    if a1q_scale is not None:
+        a1q_scale = a1q_scale[permuted_idx.clamp(max=n_token * topk - 1) //
+                              topk]
+    return (permuted_hidden_states, a1q_scale, expert_first_token_offset,
+            inv_permuted_idx.flatten(), m_indices)
 
 
 def moe_unpermute(
+    out: torch.Tensor,
     permuted_hidden_states: torch.Tensor,
     topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    src_row_id2dst_row_id_map: torch.Tensor,
-    expert_first_token_offset: torch.Tensor,
-    topk: int,
-    n_expert: int,
-    n_local_expert: int,
-) -> torch.Tensor:
+    inv_permuted_idx: torch.Tensor,
+    expert_first_token_offset: Optional[torch.Tensor] = None,
+) -> None:
     """
     This function expands and permutes activation to gathering uncontinuous
       tokens for each expert.
     Parameters:
+    - out (torch.Tensor): output tensor
     - permuted_hidden_states (torch.Tensor): permuted activation.
     - topk_weights (torch.Tensor): topk expert route weight for each token.
-    - topk_ids (torch.Tensor): topk expert route id for each token.
-    - expert_first_token_offset (torch.Tensor): offset of the first token
-       of each expert for grouped gemm.
-    - topk (int): The number of top-k experts to select.
-    - n_expert (int): The number of expert.
-    - n_local_expert (int): The number of expert in current EP rank.
+    - inv_permuted_idx (torch.Tensor): row idx map for moe_unpermute.
+    - expert_first_token_offset (Optional[torch.Tensor]): offset of the first 
+      token of each expert for grouped gemm.
     Returns:
     - hidden_states (torch.Tensor): The reduced and unpermuted activation
       tensor.
     """
-    n_token, n_hidden = topk_weights.size(0), permuted_hidden_states.size(-1)
+    topk = topk_weights.size(1)
+    n_hidden = permuted_hidden_states.size(-1)
     assert (n_hidden * permuted_hidden_states.element_size()
             ) % 16 == 0, "unpermue kernel need hidden dim align to 16B"
-    hidden_states = torch.empty((n_token, n_hidden),
-                                dtype=permuted_hidden_states.dtype,
-                                device=permuted_hidden_states.device)
-
     torch.ops._moe_C.moe_unpermute(permuted_hidden_states, topk_weights,
-                                   topk_ids, src_row_id2dst_row_id_map,
-                                   expert_first_token_offset, n_expert,
-                                   n_local_expert, topk, hidden_states)
-    return hidden_states
+                                   inv_permuted_idx, expert_first_token_offset,
+                                   topk, out)
 
 
 def moe_permute_unpermute_supported():

From a9b2a1d7048c7129ee6f55af68624256da864fed Mon Sep 17 00:00:00 2001
From: Ning Xie <andy.xning@gmail.com>
Date: Mon, 28 Jul 2025 00:51:44 +0800
Subject: [PATCH 56/57] [Misc] Refactor vllm config str (#21666)

---
 vllm/config.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 07df71ec51ef3..ea9f7dce894bb 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4790,26 +4790,26 @@ class VllmConfig:
 
     def __str__(self):
         return (
-            f"model={self.model_config.model!r},"
-            f" speculative_config={self.speculative_config!r},"
-            f" tokenizer={self.model_config.tokenizer!r}, "
-            f"skip_tokenizer_init={self.model_config.skip_tokenizer_init},"
-            f" tokenizer_mode={self.model_config.tokenizer_mode}, "
+            f"model={self.model_config.model!r}, "
+            f"speculative_config={self.speculative_config!r}, "
+            f"tokenizer={self.model_config.tokenizer!r}, "
+            f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
+            f"tokenizer_mode={self.model_config.tokenizer_mode}, "
             f"revision={self.model_config.revision}, "
-            f"override_neuron_config={self.model_config.override_neuron_config},"
-            f" tokenizer_revision={self.model_config.tokenizer_revision}, "
+            f"override_neuron_config={self.model_config.override_neuron_config}, "  # noqa
+            f"tokenizer_revision={self.model_config.tokenizer_revision}, "
             f"trust_remote_code={self.model_config.trust_remote_code}, "
             f"dtype={self.model_config.dtype}, "
-            f"max_seq_len={self.model_config.max_model_len},"
-            f" download_dir={self.load_config.download_dir!r}, "
+            f"max_seq_len={self.model_config.max_model_len}, "
+            f"download_dir={self.load_config.download_dir!r}, "
             f"load_format={self.load_config.load_format}, "
-            f"tensor_parallel_size={self.parallel_config.tensor_parallel_size},"
-            f" pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, "  # noqa
+            f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, "  # noqa
+            f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, "  # noqa
             f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, "  # noqa
             f"quantization={self.model_config.quantization}, "
             f"enforce_eager={self.model_config.enforce_eager}, "
             f"kv_cache_dtype={self.cache_config.cache_dtype}, "
-            f" device_config={self.device_config.device}, "
+            f"device_config={self.device_config.device}, "
             f"decoding_config={self.decoding_config!r}, "
             f"observability_config={self.observability_config!r}, "
             f"seed={self.model_config.seed}, "

From d80a82f961e2b3570f470171b9683d5e5ad06834 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <lwilkins@redhat.com>
Date: Sun, 27 Jul 2025 21:06:56 +0000
Subject: [PATCH 57/57] fix dp plus full cuda-graph

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 vllm/v1/attention/backends/mla/flashmla.py | 50 +++++++++++++---------
 1 file changed, 30 insertions(+), 20 deletions(-)

diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py
index d3e5300dbbd6b..51e77bc6bfa41 100644
--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -67,6 +67,20 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
         self.cg_buf_tile_scheduler_metadata = None
         self.cg_buf_num_splits = None
 
+        device_properties = torch.cuda.get_device_properties(self.device)
+        num_sms = device_properties.multi_processor_count
+
+        if self.compilation_config.full_cuda_graph:
+            self.cg_buf_tile_scheduler_metadata = torch.empty(
+                (num_sms, 8),  # TileSchedulerMetaDataSize == 8
+                device=self.device,
+                dtype=torch.int32,
+            )
+            self.cg_buf_num_splits = torch.empty(
+                (vllm_config.scheduler_config.max_num_seqs + 1),
+                device=self.device,
+                dtype=torch.int32)
+
     def _build_decode(self, block_table_tensor: torch.Tensor,
                       seq_lens: torch.Tensor) -> FlashMLADecodeMetadata:
         tile_scheduler_metadata, num_splits = \
@@ -77,28 +91,24 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
         )
 
         if self.compilation_config.full_cuda_graph:
-            # First time around (CUDAGraph capture), allocate the static buffer
-            if self.cg_buf_tile_scheduler_metadata is None:
-                self.cg_buf_tile_scheduler_metadata = tile_scheduler_metadata
-                self.cg_buf_num_splits = num_splits
-            else:
-                assert self.cg_buf_num_splits is not None
+            assert self.cg_buf_tile_scheduler_metadata is not None
+            assert self.cg_buf_num_splits is not None
 
-                # Metadata per-SM, fixed size (#SMs, TileMetadataSize)
-                assert (self.cg_buf_tile_scheduler_metadata.size() ==
-                        tile_scheduler_metadata.size())
-                self.cg_buf_tile_scheduler_metadata.\
-                    copy_(tile_scheduler_metadata)
-                tile_scheduler_metadata = self.cg_buf_tile_scheduler_metadata
+            # Metadata per-SM, fixed size (#SMs, TileMetadataSize)
+            assert (self.cg_buf_tile_scheduler_metadata.size() ==
+                    tile_scheduler_metadata.size())
+            self.cg_buf_tile_scheduler_metadata.\
+                copy_(tile_scheduler_metadata)
+            tile_scheduler_metadata = self.cg_buf_tile_scheduler_metadata
 
-                # Num splits is per-batch, varying size (batch_size,)
-                n = num_splits.size(0)
-                # make sure static buffer is large enough
-                assert n <= self.cg_buf_num_splits.size(0)
-                num_splits_view = self.cg_buf_num_splits[:n]
-                num_splits_view.copy_(num_splits)
-                self.cg_buf_num_splits[n:].fill_(0)  # fill the rest with 0s
-                num_splits = num_splits_view
+            # Num splits is per-batch, varying size (batch_size,)
+            n = num_splits.size(0)
+            # make sure static buffer is large enough
+            assert n <= self.cg_buf_num_splits.size(0)
+            num_splits_view = self.cg_buf_num_splits[:n]
+            num_splits_view.copy_(num_splits)
+            self.cg_buf_num_splits[n:].fill_(0)  # fill the rest with 0s
+            num_splits = num_splits_view
 
         return FlashMLADecodeMetadata(
             block_table=block_table_tensor,