Merge branch 'main' of github.com:vllm-project/vllm into conftest/generate_beam_search/simplify-return-value

This commit is contained in:
Chukwuma Nwaugha 2025-12-14 13:04:21 +00:00
commit 76802a010b
59 changed files with 1743 additions and 384 deletions

View File

@ -0,0 +1,74 @@
#!/usr/bin/env bash
# Scheduled accuracy gate: serve a model with vLLM under each supported
# all2all backend and assert GSM8K eval accuracy stays above a threshold.
set -euxo pipefail
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8040}
# Result JSON files land here; overridable via the OUT_DIR env var.
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
wait_for_server() {
  # Poll the server's /health endpoint once per second until it answers,
  # giving up (non-zero exit) after 600 seconds.
  local port=$1
  timeout 600 bash -c \
    "until curl -sf http://127.0.0.1:${port}/health > /dev/null; do sleep 1; done"
}
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
# Set BACKENDS based on platform
# ROCm is detected by any of: rocm-smi on PATH, /opt/rocm existing, or
# ROCM_PATH being set in the environment.
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
cleanup() {
  # Stop the vLLM server if one is still running: SIGTERM first, wait up to
  # ~10s for it to exit, then SIGKILL as a last resort. No-op otherwise.
  [[ -n "${SERVER_PID:-}" ]] || return 0
  kill -0 "${SERVER_PID}" 2>/dev/null || return 0
  kill "${SERVER_PID}" 2>/dev/null || true
  local attempt
  for attempt in $(seq 1 20); do
    kill -0 "${SERVER_PID}" 2>/dev/null || break
    sleep 0.5
  done
  kill -9 "${SERVER_PID}" 2>/dev/null || true
}
trap cleanup EXIT
# For each backend: start a server in the background, wait for health,
# run the GSM8K eval against it, assert accuracy, then tear down and
# move to the next port.
for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 4 \
--enable-expert-parallel \
--enable-eplb \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \
--max-model-len 2048 \
--gpu-memory-utilization 0.9 \
--port $PORT &
SERVER_PID=$!
wait_for_server $PORT
# Sanitize the model name (slashes, colons, spaces) into a file-safe tag.
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
# Gate: parse the saved results and abort (assert) if accuracy < THRESHOLD.
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY
cleanup
SERVER_PID=
sleep 1
# Use a fresh port per backend to avoid rebinding a lingering socket.
PORT=$((PORT+1))
done

View File

@ -1629,7 +1629,6 @@ steps:
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"

View File

@ -7,7 +7,7 @@ This guide covers optimization strategies and performance tuning for vLLM V1.
## Preemption
Due to the auto-regressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
Due to the autoregressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
In such cases, vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes
available again. When this occurs, you may see the following warning:

View File

@ -82,7 +82,7 @@ DOCKER_BUILDKIT=1 docker build . \
## Building for Arm64/aarch64
A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper and Grace-Blackwell. Using the flag `--platform "linux/arm64"` will build for arm64.
!!! note
Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
@ -104,6 +104,25 @@ A docker container can be built for aarch64 systems such as the Nvidia Grace-Hop
--build-arg RUN_WHEEL_CHECK=false
```
For (G)B300, we recommend using CUDA 13, as shown in the following command.
??? console "Command"
```bash
DOCKER_BUILDKIT=1 docker build \
--build-arg CUDA_VERSION=13.0.1 \
--build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 \
--build-arg max_jobs=256 \
--build-arg nvcc_threads=2 \
--build-arg RUN_WHEEL_CHECK=false \
--build-arg torch_cuda_arch_list='9.0 10.0+PTX' \
--platform "linux/arm64" \
--tag vllm/vllm-gb300-openai:latest \
--target vllm-openai \
-f docker/Dockerfile \
.
```
!!! note
If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.

View File

@ -4,7 +4,7 @@ Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine le
* **Upstream vLLM compatibility** It wraps around upstream vLLM without modifying its code.
* **Ease of use** Simplified deployment via Helm charts and observability through Grafana dashboards.
* **High performance** Optimized for LLM workloads with features like multi-model support, model-aware and prefix-aware routing, fast vLLM bootstrapping, and KV cache offloading with [LMCache](https://github.com/LMCache/LMCache), among others.
* **High performance** Optimized for LLM workloads with features like multimodel support, model-aware and prefix-aware routing, fast vLLM bootstrapping, and KV cache offloading with [LMCache](https://github.com/LMCache/LMCache), among others.
If you are new to Kubernetes, don't worry: in the vLLM production stack [repo](https://github.com/vllm-project/production-stack), we provide a step-by-step [guide](https://github.com/vllm-project/production-stack/blob/main/tutorials/00-install-kubernetes-env.md) and a [short video](https://www.youtube.com/watch?v=EsTJbQtzj0g) to set up everything and get started in **4 minutes**!

View File

@ -41,7 +41,7 @@ These features allow the most flexibility for cudagraph capture and compilation
* `NONE` — turn CUDA Graphs off. Good for debugging.
* `PIECEWISE` — a single-mode strategy (and past default). It is the most flexible: attention or other CUDA Graphs-incompatible operations stay eager, everything else goes into CUDA Graphs. Requires piecewise compilation.
* `FULL` — a single-mode strategy, which only captures full CUDA Graphs for non-uniform batches, then uniform-decode batches reuse the CUDA Graph of non-uniform batch of the same batch_size, since they are compatible; can be good for small models or workloads with small prompts.
* `FULL_DECODE_ONLY` — full CUDA Graph for uniform decode, no cudagraph for prefill/mixed etc; suitable for decode instances in a P/D setup where prefill is not as important, this way we can save the memory needed for `PIECEWISE` CUDA Graphs.
* `FULL_DECODE_ONLY` — full CUDA Graph for uniform decode, no cudagraph for prefill/mixed etc.; suitable for decode instances in a P/D setup where prefill is not as important, this way we can save the memory needed for `PIECEWISE` CUDA Graphs.
* `FULL_AND_PIECEWISE` — (default mode) full CUDA Graph for uniform decode, piecewise CUDA Graphs for others; generally the most performant setting, especially for low latency with small models or MoEs, but also requires the most memory and takes the longest to capture.
Defaults: If you're on v1 with piecewise compilation, we default to `FULL_AND_PIECEWISE` for better performance (for pooling models, it's still `PIECEWISE`). Otherwise, e.g. if piecewise compilation unavailable, we default to `NONE`.
@ -49,7 +49,7 @@ Defaults: If youre on v1 with piecewise compilation, we default to `FULL_AND_
While `NONE` , `PIECEWISE`, and `FULL` are single-mode configurations and simply equivalent to past implementations of eager execution, piecewise CUDA Graphs, and full CUDA Graphs respectively, `FULL_DECODE_ONLY` and `FULL_AND_PIECEWISE` are newly appended dual-mode configurations, which require dispatching to switch between concrete runtime modes according to runtime batches dynamically.
!!! note
Here, the single-modes `NONE`, `PIECEWISE`, and `FULL` are treated as the runtime modes for CUDA Graphs dispatching. If using a dual-mode, the dispatcher will always dispatch to one of its member modes (plus a potantial `NONE` if no suitable CUDA Graph available), depending on the batch composition.
Here, the single-modes `NONE`, `PIECEWISE`, and `FULL` are treated as the runtime modes for CUDA Graphs dispatching. If using a dual-mode, the dispatcher will always dispatch to one of its member modes (plus a potential `NONE` if no suitable CUDA Graph available), depending on the batch composition.
While cascade attention is not cudagraph compatible, it is now compatible with all possible cudagraph mode configurations. If a batch uses cascade attention, it always gets dispatched to `PIECEWISE` mode if available (otherwise `NONE`).

View File

@ -4,7 +4,7 @@
## Overview
vLLM now supports optimization levels (`-O0`, `-O1`, `-O2`, `-O3`). Optimization levels provide an intuitive mechnaism for users to trade startup time for performance. Higher levels have better performance but worse startup time. These optimization levels have associated defaults to help users get desired out of the box performance. Importantly, defaults set by optimization levels are purely defaults; explicit user settings will not be overwritten.
vLLM now supports optimization levels (`-O0`, `-O1`, `-O2`, `-O3`). Optimization levels provide an intuitive mechanism for users to trade startup time for performance. Higher levels have better performance but worse startup time. These optimization levels have associated defaults to help users get desired out-of-the-box performance. Importantly, defaults set by optimization levels are purely defaults; explicit user settings will not be overwritten.
## Level Summaries and Usage Examples
```bash

View File

@ -36,7 +36,7 @@ the input pointers `q`, `k_cache`, and `v_cache`, which point
to query, key, and value data on global memory that need to be read
and processed. The output pointer `out` points to global memory
where the result should be written. These four pointers actually
refer to multi-dimensional arrays, but each thread only accesses the
refer to multidimensional arrays, but each thread only accesses the
portion of data assigned to it. I have omitted all other runtime
parameters here for simplicity.
@ -229,7 +229,7 @@ manner.
## QK
As shown the pseudo code below, before the entire for loop block, we
As shown the pseudocode below, before the entire for loop block, we
fetch the query data for one token and store it in `q_vecs`. Then,
in the outer for loop, we iterate through different `k_ptrs` that
point to different tokens and prepare the `k_vecs` in the inner for
@ -403,7 +403,7 @@ for ... { // Iteration over different blocks.
}
```
As shown in the above pseudo code, in the outer loop, similar to
As shown in the above pseudocode, in the outer loop, similar to
`k_ptr`, `logits_vec` iterates over different blocks and reads
`V_VEC_SIZE` elements from `logits`. In the inner loop, each
thread reads `V_VEC_SIZE` elements from the same tokens as a

View File

@ -659,6 +659,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|--------|-------------------|----------------------|---------------------------|
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
| `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A<sup>+</sup> | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-hf` | ✅︎ | ✅︎ |
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
| `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ |
@ -743,7 +744,7 @@ Some models are supported only via the [Transformers modeling backend](#transfor
- There's no PLE caching or out-of-memory swapping support, as described in [Google's blog](https://developers.googleblog.com/en/introducing-gemma-3n/). These features might be too model-specific for vLLM, and swapping in particular may be better suited for constrained setups.
!!! note
For `InternVLChatModel`, only InternVL2.5 with Qwen2.5 text backbone (`OpenGVLab/InternVL2.5-1B` etc), InternVL3 and InternVL3.5 have video inputs support currently.
For `InternVLChatModel`, only InternVL2.5 with Qwen2.5 text backbone (`OpenGVLab/InternVL2.5-1B` etc.), InternVL3 and InternVL3.5 have video inputs support currently.
!!! note
To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.

View File

@ -154,7 +154,7 @@ vllm serve /path/to/the/model/in/the/container \
## Optimizing network communication for tensor parallelism
Efficient tensor parallelism requires fast inter-node communication, preferably through high-speed network adapters such as InfiniBand.
Efficient tensor parallelism requires fast internode communication, preferably through high-speed network adapters such as InfiniBand.
To set up the cluster to use InfiniBand, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the
[examples/online_serving/run_cluster.sh](../../examples/online_serving/run_cluster.sh) helper script.
Contact your system administrator for more information about the required flags.

View File

@ -10,7 +10,7 @@ All communications between nodes in a multi-node vLLM deployment are **insecure
### Configuration Options for Inter-Node Communications
The following options control inter-node communications in vLLM:
The following options control internode communications in vLLM:
#### 1. **Environment Variables:**
@ -28,7 +28,7 @@ The following options control inter-node communications in vLLM:
### Notes on PyTorch Distributed
vLLM uses PyTorch's distributed features for some inter-node communication. For
vLLM uses PyTorch's distributed features for some internode communication. For
detailed information about PyTorch Distributed security considerations, please
refer to the [PyTorch Security
Guide](https://github.com/pytorch/pytorch/security/policy#using-distributed-features).

View File

@ -42,60 +42,31 @@ class ModelRequestData(NamedTuple):
# Unless specified, these settings have been tested to work on a single L4.
# Voxtral
# Make sure to install mistral-common[audio].
def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.chunk import (
AudioChunk,
RawAudio,
TextChunk,
)
from mistral_common.protocol.instruct.messages import (
UserMessage,
)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
model_name = "mistralai/Voxtral-Mini-3B-2507"
tokenizer = MistralTokenizer.from_hf_hub(model_name)
# AudioFlamingo3
def run_audioflamingo3(question: str, audio_count: int) -> ModelRequestData:
model_name = "nvidia/audio-flamingo-3-hf"
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_model_len=4096,
max_num_seqs=2,
limit_mm_per_prompt={"audio": audio_count},
config_format="mistral",
load_format="mistral",
tokenizer_mode="mistral",
enforce_eager=True,
enable_chunked_prefill=False,
)
text_chunk = TextChunk(text=question)
audios = [
Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
for i in range(audio_count)
]
audio_chunks = [
AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
]
# AudioFlamingo3 uses <sound> token for audio
audio_placeholder = "<sound>" * audio_count
messages = [UserMessage(content=[*audio_chunks, text_chunk])]
req = ChatCompletionRequest(messages=messages, model=model_name)
tokens = tokenizer.encode_chat_completion(req)
prompt_ids, audios = tokens.tokens, tokens.audios
audios_and_sr = [(au.audio_array, au.sampling_rate) for au in audios]
multi_modal_data = {"audio": audios_and_sr}
prompt = (
"<|im_start|>system\n"
"You are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n"
f"{audio_placeholder}{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
return ModelRequestData(
engine_args=engine_args,
prompt_token_ids=prompt_ids,
multi_modal_data=multi_modal_data,
prompt=prompt,
)
@ -361,6 +332,63 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
)
# Voxtral
# Make sure to install mistral-common[audio].
def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.chunk import (
AudioChunk,
RawAudio,
TextChunk,
)
from mistral_common.protocol.instruct.messages import (
UserMessage,
)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
model_name = "mistralai/Voxtral-Mini-3B-2507"
tokenizer = MistralTokenizer.from_hf_hub(model_name)
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=2,
limit_mm_per_prompt={"audio": audio_count},
config_format="mistral",
load_format="mistral",
tokenizer_mode="mistral",
enforce_eager=True,
enable_chunked_prefill=False,
)
text_chunk = TextChunk(text=question)
audios = [
Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
for i in range(audio_count)
]
audio_chunks = [
AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
]
messages = [UserMessage(content=[*audio_chunks, text_chunk])]
req = ChatCompletionRequest(messages=messages, model=model_name)
tokens = tokenizer.encode_chat_completion(req)
prompt_ids, audios = tokens.tokens, tokens.audios
audios_and_sr = [(au.audio_array, au.sampling_rate) for au in audios]
multi_modal_data = {"audio": audios_and_sr}
return ModelRequestData(
engine_args=engine_args,
prompt_token_ids=prompt_ids,
multi_modal_data=multi_modal_data,
)
# Whisper
def run_whisper(question: str, audio_count: int) -> ModelRequestData:
assert audio_count == 1, "Whisper only support single audio input per prompt"
@ -382,7 +410,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
model_example_map = {
"voxtral": run_voxtral,
"audioflamingo3": run_audioflamingo3,
"gemma3n": run_gemma3n,
"granite_speech": run_granite_speech,
"midashenglm": run_midashenglm,
@ -392,6 +420,7 @@ model_example_map = {
"qwen2_audio": run_qwen2_audio,
"qwen2_5_omni": run_qwen2_5_omni,
"ultravox": run_ultravox,
"voxtral": run_voxtral,
"whisper": run_whisper,
}

View File

@ -112,7 +112,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
"messages": [
{
"role": "user",
"content": "Generate an SQL query to show the 'username' and 'email'from the 'users' table.",
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
}
],
"extra_body": {

View File

@ -36,7 +36,7 @@ def get_test_models():
DynamicShapesType.BACKED_SIZE_OBLIVIOUS,
],
)
@pytest.mark.parametrize("use_aot_compile", ["0"])
@pytest.mark.parametrize("use_aot_compile", ["0", "1"])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
@pytest.mark.parametrize("evaluate_guards", [False, True])
@pytest.mark.skipif(
@ -54,6 +54,12 @@ def test_dynamic_shapes_compilation(
if use_bytecode_hook and shapes_type == DynamicShapesType.UNBACKED:
pytest.skip("UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0")
if evaluate_guards and shapes_type == DynamicShapesType.UNBACKED:
pytest.skip("unbacked dynamic shapes do not add guards")
if evaluate_guards and use_aot_compile:
pytest.skip("evaluate_guards requires use_aot_compile=0")
monkeypatch.setenv("VLLM_USE_AOT_COMPILE", use_aot_compile)
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
@ -120,7 +126,7 @@ def test_model_specialization_with_evaluate_guards(
and dynamic_shapes_type == DynamicShapesType.BACKED
and evaluate_guards
):
pytest.skip("evaluate_guards for backed does not work with aot_compile =1")
pytest.skip("evaluate_guards for backed does not work with aot_compile=1")
@support_torch_compile
class ModelWithSizeCheck(torch.nn.Module):

View File

@ -80,10 +80,9 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
return dict(engine_prompt), {}
async def _fake_preprocess_chat(*args, **kwargs):
# return conversation, request_prompts, engine_prompts
# return conversation, engine_prompts
return (
[{"role": "user", "content": "Test"}],
[[1, 2, 3]],
[{"prompt_token_ids": [1, 2, 3]}],
)

View File

@ -877,7 +877,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = serving_chat._make_request_with_harmony(req)
verify_harmony_messages(
input_messages,
[
@ -905,7 +905,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages_2, _, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
[
@ -927,7 +927,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = serving_chat._make_request_with_harmony(req)
verify_harmony_messages(
input_messages,
[
@ -971,7 +971,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages_2, _, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
[
@ -1008,7 +1008,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = serving_chat._make_request_with_harmony(req)
verify_harmony_messages(
input_messages,
[
@ -1052,7 +1052,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages_2, _, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
[
@ -1089,7 +1089,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = serving_chat._make_request_with_harmony(req)
verify_harmony_messages(
input_messages,
[
@ -1133,7 +1133,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages_2, _, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
[
@ -1183,7 +1183,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the third turn's input
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages_3, _, _ = serving_chat._make_request_with_harmony(req_3)
input_messages_3, _ = serving_chat._make_request_with_harmony(req_3)
verify_harmony_messages(
input_messages_3,
[
@ -1246,7 +1246,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the fourth turn's input
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages_4, _, _ = serving_chat._make_request_with_harmony(req_4)
input_messages_4, _ = serving_chat._make_request_with_harmony(req_4)
verify_harmony_messages(
input_messages_4,
[
@ -1295,7 +1295,7 @@ class TestServingChatWithHarmony:
},
]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = serving_chat._make_request_with_harmony(req)
verify_harmony_messages(
input_messages,
@ -1327,7 +1327,7 @@ class TestServingChatWithHarmony:
},
]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = serving_chat._make_request_with_harmony(req)
verify_harmony_messages(
input_messages,
@ -1357,7 +1357,7 @@ class TestServingChatWithHarmony:
},
]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = serving_chat._make_request_with_harmony(req)
verify_harmony_messages(
input_messages,

View File

@ -21,7 +21,7 @@ from vllm.entrypoints.openai.serving_responses import (
extract_tool_types,
)
from vllm.entrypoints.tool_server import ToolServer
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.inputs.data import TokensPrompt
class MockConversationContext(ConversationContext):
@ -237,7 +237,7 @@ class TestValidateGeneratorInput:
"""Test _validate_generator_input with valid prompt length"""
# Create an engine prompt with valid length (less than max_model_len)
valid_prompt_token_ids = list(range(5)) # 5 tokens < 100 max_model_len
engine_prompt = EngineTokensPrompt(prompt_token_ids=valid_prompt_token_ids)
engine_prompt = TokensPrompt(prompt_token_ids=valid_prompt_token_ids)
# Call the method
result = serving_responses_instance._validate_generator_input(engine_prompt)
@ -247,7 +247,7 @@ class TestValidateGeneratorInput:
# create an invalid engine prompt
invalid_prompt_token_ids = list(range(200)) # 100 tokens >= 100 max_model_len
engine_prompt = EngineTokensPrompt(prompt_token_ids=invalid_prompt_token_ids)
engine_prompt = TokensPrompt(prompt_token_ids=invalid_prompt_token_ids)
# Call the method
result = serving_responses_instance._validate_generator_input(engine_prompt)

View File

@ -41,9 +41,9 @@ def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch):
qweight = torch.randint(
-2000000000, 2000000000, (8192, 256), device="cuda", dtype=torch.int32
)
scales = torch.randint(
scales = torch.empty((64, 2048), device="cuda", dtype=torch.float16)
qzeros = torch.randint(
-2000000000, 2000000000, (64, 256), device="cuda", dtype=torch.int32
)
qzeros = torch.empty((64, 2048), device="cuda", dtype=torch.float16)
split_k_iters = 8
opcheck(torch.ops._C.awq_gemm, (input, qweight, qzeros, scales, split_k_iters))
opcheck(torch.ops._C.awq_gemm, (input, qweight, scales, qzeros, split_k_iters))

View File

@ -0,0 +1 @@
{"transcriptions": ["There is no clear relationship between the barking and the music, as they seem to be independent of each other.", "(B) To indicate that language cannot express clearly, satirizing the inversion of black and white in the world"], "token_ids": [[3862, 374, 902, 2797, 5025, 1948, 279, 293, 33452, 323, 279, 4627, 11, 438, 807, 2803, 311, 387, 9489, 315, 1817, 1008, 13, 151645], [5349, 8, 2014, 13216, 429, 4128, 4157, 3158, 9355, 11, 7578, 404, 4849, 279, 46488, 315, 3691, 323, 4158, 304, 279, 1879, 151645, 151671]]}

View File

@ -0,0 +1 @@
{"transcriptions": ["The content of the input audio is 'you can ask why over and over and over again forever even if one day we explain every physical interaction and scientific law and hope and dream and regret with a single elegant equation'."], "token_ids": [[785, 2213, 315, 279, 1946, 7699, 374, 364, 9330, 646, 2548, 3170, 916, 323, 916, 323, 916, 1549, 15683, 1496, 421, 825, 1899, 582, 10339, 1449, 6961, 16230, 323, 12344, 2329, 323, 3900, 323, 7904, 323, 22231, 448, 264, 3175, 25777, 23606, 4427, 151645]]}

View File

@ -0,0 +1,142 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 The vLLM team.
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import pytest
from tests.models.registry import HF_EXAMPLE_MODELS
from vllm import LLM, SamplingParams
MODEL_NAME = "nvidia/audio-flamingo-3-hf"
def get_fixture_path(filename):
    """Return the path of *filename* inside the audioflamingo3 fixtures dir."""
    fixtures_dir = os.path.join(
        os.path.dirname(__file__), "../../fixtures/audioflamingo3"
    )
    return os.path.join(fixtures_dir, filename)
@pytest.fixture(scope="module")
def llm():
    """Module-scoped vLLM engine for the AudioFlamingo3 model.

    Skips the module when the installed transformers version does not
    support the architecture, or when the model fails to load.
    """
    # Check if the model is supported by the current transformers version
    model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
    model_info.check_transformers_version(on_fail="skip")
    try:
        return LLM(
            model=MODEL_NAME,
            trust_remote_code=True,
            dtype="bfloat16",
            enforce_eager=True,
            limit_mm_per_prompt={"audio": 1},
        )
    except Exception as e:
        pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")
def test_single_generation(llm):
    """Transcribing one audio clip reproduces the recorded fixture text."""
    fixture = get_fixture_path("expected_results_single.json")
    if not os.path.exists(fixture):
        pytest.skip(f"Fixture not found: {fixture}")
    with open(fixture) as fh:
        expected = json.load(fh)

    audio_url = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Why_do_we_ask_questions_converted.wav"
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "audio_url", "audio_url": {"url": audio_url}},
                {"type": "text", "text": "Transcribe the input speech."},
            ],
        }
    ]

    outputs = llm.chat(
        messages=conversation,
        sampling_params=SamplingParams(temperature=0.0, max_tokens=128),
    )

    got = outputs[0].outputs[0].text.strip()
    want = expected["transcriptions"][0]
    # Containment in either direction tolerates minor truncation differences.
    assert want in got or got in want
def test_batched_generation(llm):
    """Batched two-clip chat: each output must match its recorded fixture text.

    Each item carries an explicit ``expected_idx`` into the fixture's
    ``transcriptions`` list; we index by it instead of relying on the
    positional coincidence that item order equals fixture order.
    """
    fixture_path = get_fixture_path("expected_results_batched.json")
    if not os.path.exists(fixture_path):
        pytest.skip(f"Fixture not found: {fixture_path}")
    with open(fixture_path) as f:
        expected = json.load(f)

    # Two independent audio-QA items; expected_idx maps each one to its
    # reference transcription in the fixture file.
    items = [
        {
            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
            "question": "What is surprising about the relationship "
            "between the barking and the music?",
            "expected_idx": 0,
        },
        {
            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
            "question": (
                "Why is the philosopher's name mentioned in the lyrics? "
                "(A) To express a sense of nostalgia "
                "(B) To indicate that language cannot express clearly, "
                "satirizing the inversion of black and white in the world "
                "(C) To add depth and complexity to the lyrics "
                "(D) To showcase the wisdom and influence of the philosopher"
            ),
            "expected_idx": 1,
        },
    ]

    conversations = []
    for item in items:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "audio_url", "audio_url": {"url": item["audio_url"]}},
                    {"type": "text", "text": item["question"]},
                ],
            }
        ]
        conversations.append(messages)

    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
    outputs = llm.chat(
        messages=conversations,
        sampling_params=sampling_params,
    )

    # Outputs come back in the same order as `conversations` (== `items`),
    # so pair them up and compare each against its declared fixture entry.
    for item, output in zip(items, outputs):
        generated_text = output.outputs[0].text.strip()
        expected_text = expected["transcriptions"][item["expected_idx"]]
        # Containment in either direction tolerates minor truncation.
        assert expected_text in generated_text or generated_text in expected_text

View File

@ -0,0 +1,125 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 The vLLM team.
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest.mock import MagicMock
import numpy as np
import pytest
import torch
from transformers import PretrainedConfig
from tests.models.registry import HF_EXAMPLE_MODELS
class MockAudioFlamingo3Config(PretrainedConfig):
    """Minimal PretrainedConfig stand-in exposing audio/text sub-configs."""

    model_type = "audioflamingo3"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Bare sub-configs; only their existence matters for these tests.
        self.audio_config = PretrainedConfig()
        self.text_config = PretrainedConfig()
class MockAudioFlamingo3Processor:
    """Lightweight fake of the HF AudioFlamingo3 processor used in tests."""

    def __init__(self):
        self.audio_token = "<sound>"
        self.audio_token_id = 12345
        self.feature_extractor = MockFeatureExtractor()

    def __call__(self, text=None, audios=None, **kwargs):
        # Fixed-shape dummy outputs; the actual values are irrelevant here.
        dummy_features = np.zeros((3000, 80))
        return {"input_ids": [1, 2, 3], "input_features": [dummy_features]}
class MockFeatureExtractor:
    """Fake feature extractor exposing the attributes the processor reads."""

    def __init__(self):
        # Mirrors a 16 kHz extractor with 30-second chunks.
        self.sampling_rate = 16000
        self.chunk_length = 30
@pytest.fixture
def mock_ctx():
    """Mocked processing context wired to the fake AF3 config and processor."""
    ctx = MagicMock()
    fake_config = MockAudioFlamingo3Config()
    ctx.get_hf_config.return_value = fake_config
    ctx.get_hf_processor.return_value = MockAudioFlamingo3Processor()
    ctx.model_config.hf_config = fake_config
    return ctx
@pytest.fixture(autouse=True)
def check_transformers_version():
    """Skip every test in this module when the installed transformers
    version does not support AudioFlamingo3."""
    HF_EXAMPLE_MODELS.get_hf_info(
        "AudioFlamingo3ForConditionalGeneration"
    ).check_transformers_version(on_fail="skip")
def test_audio_chunk_counting(mock_ctx):
    """Audio longer than one chunk (30 s) must be split into multiple chunks."""
    from vllm.model_executor.models.audioflamingo3 import (
        AudioFlamingo3DummyInputsBuilder,
        AudioFlamingo3MultiModalProcessor,
        AudioFlamingo3ProcessingInfo,
    )

    info = AudioFlamingo3ProcessingInfo(mock_ctx)
    mm_processor = AudioFlamingo3MultiModalProcessor(
        info, AudioFlamingo3DummyInputsBuilder(info)
    )

    sample_rate = 16000
    # One exactly-30s clip (1 chunk) and one 45s clip (2 chunks).
    mm_data = {
        "audio": [np.zeros(30 * sample_rate), np.zeros(45 * sample_rate)]
    }

    from vllm.multimodal.processing import BaseMultiModalProcessor

    def fake_hf_call(self, prompt, mm_data, mm_kwargs, tok_kwargs):
        # Bypass the real HF processor; only chunk bookkeeping is under test.
        return {"input_ids": [1, 2, 3], "input_features": torch.randn(1, 80, 3000)}

    with pytest.MonkeyPatch.context() as mp:
        mp.setattr(BaseMultiModalProcessor, "_call_hf_processor", fake_hf_call)
        processed = mm_processor._call_hf_processor(
            "<|user|>Listen.<|end|>", mm_data, {}, {}
        )

    chunk_counts = processed["chunk_counts"]
    assert len(chunk_counts) == 2
    assert chunk_counts[0].item() == 1
    assert chunk_counts[1].item() == 2
def test_dummy_data_generation(mock_ctx):
    """Dummy multimodal data should contain max-duration (600 s) audio clips."""
    from vllm.model_executor.models.audioflamingo3 import (
        AudioFlamingo3DummyInputsBuilder,
        AudioFlamingo3ProcessingInfo,
    )

    builder = AudioFlamingo3DummyInputsBuilder(
        AudioFlamingo3ProcessingInfo(mock_ctx)
    )
    dummy_data = builder.get_dummy_mm_data(100, {"audio": 2}, None)

    assert "audio" in dummy_data
    audio_entries = dummy_data["audio"]
    assert len(audio_entries) == 2
    # 600 seconds of audio at 16 kHz.
    assert len(audio_entries[0]) == 600 * 16000

View File

@ -578,6 +578,9 @@ _AUTOMATIC_CONVERTED_MODELS = {
_MULTIMODAL_EXAMPLE_MODELS = {
# [Decoder-only]
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo(
"nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0.dev"
),
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
"BeeForConditionalGeneration": _HfExamplesInfo(
"Open-Bee/Bee-8B-RL",

View File

@ -34,6 +34,13 @@ INPUTS_SLICES = [
]
# Test that a nested mixed-type list of lists raises a TypeError.
@pytest.mark.parametrize("invalid_input", [[[1, 2], ["foo", "bar"]]])
def test_invalid_input_raise_type_error(invalid_input):
    """A batch mixing token-id sublists and string sublists is rejected."""
    with pytest.raises(TypeError):
        parse_raw_prompts(invalid_input)
def test_parse_raw_single_batch_empty():
    """An empty prompt batch must be rejected with a descriptive ValueError."""
    with pytest.raises(ValueError, match="at least one prompt"):
        parse_raw_prompts([])

View File

@ -76,6 +76,8 @@ def sample_json_schema():
},
"required": ["name", "age", "skills", "grade", "email", "work_history"],
"additionalProperties": False,
"minProperties": 1,
"maxProperties": 10,
}
@ -96,6 +98,9 @@ def unsupported_json_schema():
},
"required": ["score", "tags"],
"additionalProperties": False,
"patternProperties": {
"^score$": {"type": "integer"},
},
}

View File

@ -44,8 +44,6 @@ def unsupported_array_schemas():
@pytest.fixture
def unsupported_object_schemas():
return [
{"type": "object", "minProperties": 1},
{"type": "object", "maxProperties": 5},
{"type": "object", "propertyNames": {"pattern": "^[a-z]+$"}},
{"type": "object", "patternProperties": {"^S": {"type": "string"}}},
]
@ -79,6 +77,8 @@ def supported_schema():
},
},
},
"minProperties": 1,
"maxProperties": 100,
}

View File

@ -498,15 +498,15 @@ def awq_dequantize(
def awq_gemm(
input: torch.Tensor,
qweight: torch.Tensor,
qzeros: torch.Tensor,
scales: torch.Tensor,
qzeros: torch.Tensor,
split_k_iters: int,
) -> torch.Tensor:
if envs.VLLM_USE_TRITON_AWQ:
from vllm.model_executor.layers.quantization.awq_triton import awq_gemm_triton
return awq_gemm_triton(input, qweight, qzeros, scales, split_k_iters)
return torch.ops._C.awq_gemm(input, qweight, qzeros, scales, split_k_iters)
return awq_gemm_triton(input, qweight, scales, qzeros, split_k_iters)
return torch.ops._C.awq_gemm(input, qweight, scales, qzeros, split_k_iters)
# gptq
@ -632,8 +632,8 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
def _awq_gemm_fake(
input: torch.Tensor,
qweight: torch.Tensor,
qzeros: torch.Tensor,
scales: torch.Tensor,
qzeros: torch.Tensor,
split_k_iters: torch.SymInt,
) -> torch.Tensor:
num_in_feats = input.size(0)

326
vllm/benchmarks/startup.py Normal file
View File

@ -0,0 +1,326 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark the cold and warm startup time of vLLM models.
This script measures total startup time (including model loading, compilation,
and cache operations) for both cold and warm scenarios:
- Cold startup: Fresh start with no caches (temporary cache directories)
- Warm startup: Using cached compilation and model info
"""
import argparse
import dataclasses
import json
import multiprocessing
import os
import shutil
import tempfile
import time
from contextlib import contextmanager
from typing import Any
import numpy as np
from tqdm import tqdm
from vllm.benchmarks.lib.utils import (
convert_to_pytorch_benchmark_format,
write_to_json,
)
from vllm.engine.arg_utils import EngineArgs
@contextmanager
def cold_startup():
"""
Context manager to measure cold startup time:
1. Uses a temporary directory for vLLM cache to avoid any pollution
between cold startup iterations.
2. Uses inductor's fresh_cache to clear torch.compile caches.
"""
from torch._inductor.utils import fresh_cache
# Use temporary directory for caching to avoid any pollution between cold startups
original_cache_root = os.environ.get("VLLM_CACHE_ROOT")
temp_cache_dir = tempfile.mkdtemp(prefix="vllm_startup_bench_cold_")
try:
os.environ["VLLM_CACHE_ROOT"] = temp_cache_dir
with fresh_cache():
yield
finally:
# Clean up temporary cache directory
shutil.rmtree(temp_cache_dir, ignore_errors=True)
if original_cache_root:
os.environ["VLLM_CACHE_ROOT"] = original_cache_root
else:
os.environ.pop("VLLM_CACHE_ROOT", None)
def run_startup_in_subprocess(engine_args_dict, result_queue):
    """Build an LLM in this (sub)process and report timing via the queue.

    On success, a dict with ``total_startup_time`` and ``compilation_time``
    is enqueued. On failure, ``None`` is enqueued followed by the error
    message string (the consumer reads them in that order).
    """
    try:
        # Imported here so the subprocess performs its own fresh vLLM import.
        from vllm import LLM
        from vllm.engine.arg_utils import EngineArgs

        engine_args = EngineArgs(**engine_args_dict)

        # Total startup time covers engine construction end to end.
        t0 = time.perf_counter()
        llm = LLM(**dataclasses.asdict(engine_args))
        elapsed = time.perf_counter() - t0

        # Pull the torch.compile time out of the engine config when exposed.
        compile_seconds = 0.0
        engine = llm.llm_engine
        if hasattr(engine, "vllm_config"):
            comp_cfg = getattr(engine.vllm_config, "compilation_config", None)
            if comp_cfg is not None:
                compile_seconds = comp_cfg.compilation_time
        result_queue.put(
            {
                "total_startup_time": elapsed,
                "compilation_time": compile_seconds,
            }
        )
    except Exception as e:
        result_queue.put(None)
        result_queue.put(str(e))
def save_to_pytorch_benchmark_format(
    args: argparse.Namespace, results: dict[str, Any]
) -> None:
    """Write startup/compilation metrics as PyTorch benchmark JSON files.

    Emits one ``<output>.<metric>.pytorch.json`` file per metric group
    (cold/warm x startup/compilation); empty record sets are skipped.
    """
    base_name = os.path.splitext(args.output_json)[0]
    # All four metric groups share the same key structure in `results`
    # (avg_<m>_time, <m>_times, <m>_percentiles), so one loop replaces
    # four copies of the conversion/write logic.
    for metric in (
        "cold_startup",
        "cold_compilation",
        "warm_startup",
        "warm_compilation",
    ):
        records = convert_to_pytorch_benchmark_format(
            args=args,
            metrics={f"avg_{metric}_time": results[f"avg_{metric}_time"]},
            extra_info={
                f"{metric}_times": results[f"{metric}_times"],
                f"{metric}_percentiles": results[f"{metric}_percentiles"],
            },
        )
        if records:
            write_to_json(f"{base_name}.{metric}.pytorch.json", records)
def add_cli_args(parser: argparse.ArgumentParser):
    """Register startup-benchmark flags plus all engine arguments."""
    # (flag, default, help) for the three integer iteration counters.
    iteration_flags = [
        ("--num-iters-cold", 5, "Number of cold startup iterations."),
        (
            "--num-iters-warmup",
            3,
            "Number of warmup iterations before benchmarking warm startups.",
        ),
        ("--num-iters-warm", 5, "Number of warm startup iterations."),
    ]
    for flag, default, help_text in iteration_flags:
        parser.add_argument(flag, type=int, default=default, help=help_text)
    parser.add_argument(
        "--output-json",
        type=str,
        default=None,
        help="Path to save the startup time results in JSON format.",
    )
    return EngineArgs.add_cli_args(parser)
def _print_scenario_stats(
    label,
    avg_startup,
    avg_compilation,
    startup_percentiles,
    compilation_percentiles,
    percentages,
):
    """Print average and percentile timing stats for one scenario (COLD/WARM)."""
    print(f"\n{label} STARTUP:")
    print(f"Avg total startup time: {avg_startup:.2f} seconds")
    print(f"Avg compilation time: {avg_compilation:.2f} seconds")
    print("Startup time percentiles:")
    for percentage, percentile in zip(percentages, startup_percentiles):
        print(f" {percentage}%: {percentile:.2f} seconds")
    print("Compilation time percentiles:")
    for percentage, percentile in zip(percentages, compilation_percentiles):
        print(f" {percentage}%: {percentile:.2f} seconds")


def main(args: argparse.Namespace):
    """Run the startup benchmark: cold iterations, warmup, then warm iterations.

    Each measurement runs in its own spawned subprocess for isolation; cold
    iterations additionally run under `cold_startup()` so no caches persist.
    """
    # 'spawn' gives each measurement subprocess a pristine interpreter,
    # so no state leaks between iterations.
    multiprocessing.set_start_method("spawn", force=True)

    engine_args = EngineArgs.from_cli_args(args)

    def create_llm_and_measure_startup():
        """Start one subprocess that builds the LLM and return its timings.

        Raises:
            RuntimeError: if the subprocess died or reported a failure.
        """
        # Convert engine_args to a plain dict so it pickles across processes.
        engine_args_dict = dataclasses.asdict(engine_args)
        result_queue = multiprocessing.Queue()
        process = multiprocessing.Process(
            target=run_startup_in_subprocess,
            args=(engine_args_dict, result_queue),
        )
        process.start()
        process.join()
        if result_queue.empty():
            raise RuntimeError("Subprocess did not return a result")
        result = result_queue.get()
        if result is None:
            # Failure protocol: None is followed by the error message string.
            if not result_queue.empty():
                raise RuntimeError(f"Subprocess failed: {result_queue.get()}")
            raise RuntimeError("Subprocess failed with unknown error")
        return result

    def measure_iterations(num_iters, desc, cold):
        """Run `num_iters` measurements (each under cold_startup() if `cold`)
        and return (startup_times, compilation_times)."""
        startup_times = []
        compilation_times = []
        for _ in tqdm(range(num_iters), desc=desc):
            if cold:
                with cold_startup():
                    metrics = create_llm_and_measure_startup()
            else:
                metrics = create_llm_and_measure_startup()
            startup_times.append(metrics["total_startup_time"])
            compilation_times.append(metrics["compilation_time"])
        return startup_times, compilation_times

    # Keep the engine in-process so startup metrics can be collected.
    os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
    print("Setting VLLM_ENABLE_V1_MULTIPROCESSING=0 to collect startup metrics.\n")

    print("Measuring cold startup time...\n")
    cold_startup_times, cold_compilation_times = measure_iterations(
        args.num_iters_cold, "Cold startup iterations", cold=True
    )

    # Populate the caches before timing warm startups.
    print("\nWarming up for warm startup measurement...\n")
    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
        create_llm_and_measure_startup()

    print("\nMeasuring warm startup time...\n")
    warm_startup_times, warm_compilation_times = measure_iterations(
        args.num_iters_warm, "Warm startup iterations", cold=False
    )

    # Aggregate statistics.
    percentages = [10, 25, 50, 75, 90, 99]
    avg_cold_startup = np.mean(cold_startup_times)
    avg_cold_compilation = np.mean(cold_compilation_times)
    avg_warm_startup = np.mean(warm_startup_times)
    avg_warm_compilation = np.mean(warm_compilation_times)
    cold_startup_percentiles = np.percentile(cold_startup_times, percentages)
    cold_compilation_percentiles = np.percentile(cold_compilation_times, percentages)
    warm_startup_percentiles = np.percentile(warm_startup_times, percentages)
    warm_compilation_percentiles = np.percentile(warm_compilation_times, percentages)

    print("\n" + "=" * 60)
    print("STARTUP TIME BENCHMARK RESULTS")
    print("=" * 60)
    _print_scenario_stats(
        "COLD",
        avg_cold_startup,
        avg_cold_compilation,
        cold_startup_percentiles,
        cold_compilation_percentiles,
        percentages,
    )
    _print_scenario_stats(
        "WARM",
        avg_warm_startup,
        avg_warm_compilation,
        warm_startup_percentiles,
        warm_compilation_percentiles,
        percentages,
    )
    print("=" * 60)

    # Output JSON results if specified.
    if args.output_json:
        results = {
            "avg_cold_startup_time": float(avg_cold_startup),
            "avg_cold_compilation_time": float(avg_cold_compilation),
            "cold_startup_times": cold_startup_times,
            "cold_compilation_times": cold_compilation_times,
            "cold_startup_percentiles": dict(
                zip(percentages, cold_startup_percentiles.tolist())
            ),
            "cold_compilation_percentiles": dict(
                zip(percentages, cold_compilation_percentiles.tolist())
            ),
            "avg_warm_startup_time": float(avg_warm_startup),
            "avg_warm_compilation_time": float(avg_warm_compilation),
            "warm_startup_times": warm_startup_times,
            "warm_compilation_times": warm_compilation_times,
            "warm_startup_percentiles": dict(
                zip(percentages, warm_startup_percentiles.tolist())
            ),
            "warm_compilation_percentiles": dict(
                zip(percentages, warm_compilation_percentiles.tolist())
            ),
        }
        with open(args.output_json, "w") as f:
            json.dump(results, f, indent=4)
        save_to_pytorch_benchmark_format(args, results)

View File

@ -463,21 +463,27 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
# the tag for the part of model being compiled,
# e.g. backbone/eagle_head
model_tag: str = "backbone"
model_is_encoder: bool = False
@contextmanager
def set_model_tag(tag: str):
def set_model_tag(tag: str, is_encoder: bool = False):
"""Context manager to set the model tag."""
global model_tag
global model_is_encoder
assert tag != model_tag, (
f"Model tag {tag} is the same as the current tag {model_tag}."
)
old_tag = model_tag
old_is_encoder = model_is_encoder
model_tag = tag
model_is_encoder = is_encoder
try:
yield
finally:
model_tag = old_tag
model_is_encoder = old_is_encoder
class VllmBackend:
@ -523,6 +529,9 @@ class VllmBackend:
# them, e.g. backbone (default), eagle_head, etc.
self.prefix = prefix or model_tag
# Mark compilation for encoder.
self.is_encoder = model_is_encoder
# Passes to run on the graph post-grad.
self.pass_manager = resolve_obj_by_qualname(
current_platform.get_pass_manager_cls()

View File

@ -390,14 +390,6 @@ def _support_torch_compile(
serialized backend artifacts), then we need to generate a new AOT
compile artifact from scratch.
"""
# Validate that AOT compile is not used with unbacked dynamic
# shapes. aot_compile re-allocates backed symbols post dynamo!
if ds_type == DynamicShapesType.UNBACKED:
raise ValueError(
"AOT compilation is not compatible with UNBACKED dynamic shapes. "
"Please use BACKED or BACKED_SIZE_OBLIVIOUS dynamic shapes type "
"when VLLM_USE_AOT_COMPILE is enabled."
)
from .caching import compilation_config_hash_factors
factors: list[str] = compilation_config_hash_factors(self.vllm_config)

View File

@ -53,12 +53,7 @@ class PiecewiseBackend:
self.is_last_graph = piecewise_compile_index == total_piecewise_compiles - 1
self.is_full_graph = total_piecewise_compiles == 1
# TODO: we need to generalize encoder compilation to other models
self.is_encoder_compilation = vllm_backend.prefix in [
"Qwen2_5_VisionPatchEmbed",
"Qwen2_5_VisionPatchMerger",
"Qwen2_5_VisionBlock",
]
self.is_encoder_compilation = vllm_backend.is_encoder
self.compile_ranges = self.compilation_config.get_compile_ranges()
if self.is_encoder_compilation:

View File

@ -322,9 +322,6 @@ async def transfer_layer(
num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
assert num_physical_experts == ep_size * num_local_physical_experts
# A buffer to hold the expert weights in one layer during the exchange.
# NOTE: Currently we assume the same weights across different layers
# have the same shape.
is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
num_local_experts=num_local_physical_experts,

View File

@ -7,7 +7,6 @@ from prometheus_client import Counter, Gauge, Histogram
from vllm.config import KVTransferConfig, VllmConfig
from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
from vllm.distributed.kv_transfer.kv_transfer_state import has_kv_transfer_group
from vllm.logger import init_logger
PromMetric: TypeAlias = Gauge | Counter | Histogram
@ -53,8 +52,6 @@ class KVConnectorStats:
class KVConnectorLogging:
def __init__(self, kv_transfer_config: KVTransferConfig | None):
# This should be called on frontend process.
assert not has_kv_transfer_group()
# Instantiate the connector's stats class.
if kv_transfer_config and kv_transfer_config.kv_connector:
self.connector_cls = KVConnectorFactory.get_connector_class(

View File

@ -2,12 +2,14 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand
from vllm.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand
from vllm.entrypoints.cli.benchmark.startup import BenchmarkStartupSubcommand
from vllm.entrypoints.cli.benchmark.sweep import BenchmarkSweepSubcommand
from vllm.entrypoints.cli.benchmark.throughput import BenchmarkThroughputSubcommand
__all__: list[str] = [
"BenchmarkLatencySubcommand",
"BenchmarkServingSubcommand",
"BenchmarkStartupSubcommand",
"BenchmarkSweepSubcommand",
"BenchmarkThroughputSubcommand",
]

View File

@ -0,0 +1,21 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
from vllm.benchmarks.startup import add_cli_args, main
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
class BenchmarkStartupSubcommand(BenchmarkSubcommandBase):
    """The `startup` subcommand for `vllm bench`."""

    # Subcommand name and help text shown in the CLI.
    name = "startup"
    help = "Benchmark the startup time of vLLM models."

    @classmethod
    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
        # Delegate flag registration to vllm.benchmarks.startup so the CLI
        # surface stays defined next to the benchmark implementation.
        add_cli_args(parser)

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        # Entry point invoked when `vllm bench startup` is executed.
        main(args)

View File

@ -61,7 +61,7 @@ from vllm.entrypoints.openai.tool_parsers import ToolParser
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall
from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.inputs.data import TokensPrompt
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.outputs import CompletionOutput, RequestOutput
@ -234,11 +234,7 @@ class OpenAIServingChat(OpenAIServing):
)
if error_check_ret is not None:
return error_check_ret
(
conversation,
request_prompts,
engine_prompts,
) = await self._preprocess_chat(
conversation, engine_prompts = await self._preprocess_chat(
request,
tokenizer,
request.messages,
@ -254,11 +250,7 @@ class OpenAIServingChat(OpenAIServing):
)
else:
# For GPT-OSS.
(
conversation,
request_prompts,
engine_prompts,
) = self._make_request_with_harmony(request)
conversation, engine_prompts = self._make_request_with_harmony(request)
except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
logger.exception("Error in preprocessing prompt inputs")
return self.create_error_response(f"{e} {e.__cause__}")
@ -278,7 +270,7 @@ class OpenAIServingChat(OpenAIServing):
generators: list[AsyncGenerator[RequestOutput, None]] = []
try:
for i, engine_prompt in enumerate(engine_prompts):
prompt_text, _, _ = self._get_prompt_components(request_prompts[i])
prompt_text, _, _ = self._get_prompt_components(engine_prompt)
# If we are creating sub requests for multiple prompts, ensure that they
# have unique request ids.
sub_request_id = (
@ -313,7 +305,7 @@ class OpenAIServingChat(OpenAIServing):
self._log_inputs(
sub_request_id,
request_prompts[i],
engine_prompt,
params=sampling_params,
lora_request=lora_request,
)
@ -537,7 +529,7 @@ class OpenAIServingChat(OpenAIServing):
request_id: str,
model_name: str,
conversation: list[ConversationMessage],
tokenizer: TokenizerLike,
tokenizer: TokenizerLike | None,
request_metadata: RequestResponseMetadata,
) -> AsyncGenerator[str, None]:
created_time = int(time.time())
@ -591,6 +583,11 @@ class OpenAIServingChat(OpenAIServing):
try:
if self.reasoning_parser:
if tokenizer is None:
raise ValueError(
"Tokenizer not available when `skip_tokenizer_init=True`"
)
reasoning_parser = self.reasoning_parser(
tokenizer,
chat_template_kwargs=request.chat_template_kwargs, # type: ignore
@ -604,6 +601,11 @@ class OpenAIServingChat(OpenAIServing):
# Prepare the tool parser if it's needed
try:
if tool_choice_auto and self.tool_parser:
if tokenizer is None:
raise ValueError(
"Tokenizer not available when `skip_tokenizer_init=True`"
)
tool_parsers: list[ToolParser | None] = [
self.tool_parser(tokenizer)
] * num_choices
@ -1317,7 +1319,7 @@ class OpenAIServingChat(OpenAIServing):
request_id: str,
model_name: str,
conversation: list[ConversationMessage],
tokenizer: TokenizerLike,
tokenizer: TokenizerLike | None,
request_metadata: RequestResponseMetadata,
) -> ErrorResponse | ChatCompletionResponse:
created_time = int(time.time())
@ -1367,6 +1369,11 @@ class OpenAIServingChat(OpenAIServing):
reasoning = None
if self.tool_parser is not None:
if tokenizer is None:
raise ValueError(
"Tokenizer not available when `skip_tokenizer_init=True`"
)
tool_parser = self.tool_parser(tokenizer)
# NOTE: We use token_ids for openai tool parser
tool_call_info = tool_parser.extract_tool_calls(
@ -1409,6 +1416,11 @@ class OpenAIServingChat(OpenAIServing):
if self.reasoning_parser:
try:
if tokenizer is None:
raise ValueError(
"Tokenizer not available when `skip_tokenizer_init=True`"
)
reasoning_parser = self.reasoning_parser(
tokenizer,
chat_template_kwargs=request.chat_template_kwargs, # type: ignore
@ -1648,7 +1660,7 @@ class OpenAIServingChat(OpenAIServing):
self,
logprobs: dict[int, Logprob],
top_logprobs: int | None,
tokenizer: TokenizerLike,
tokenizer: TokenizerLike | None,
should_return_as_token_id: bool,
) -> list[ChatCompletionLogProb]:
return [
@ -1672,7 +1684,7 @@ class OpenAIServingChat(OpenAIServing):
self,
token_ids: GenericSequence[int],
top_logprobs: GenericSequence[dict[int, Logprob] | None],
tokenizer: TokenizerLike,
tokenizer: TokenizerLike | None,
num_output_top_logprobs: int | None = None,
return_as_token_id: bool | None = None,
) -> ChatCompletionLogProbs:
@ -1690,6 +1702,11 @@ class OpenAIServingChat(OpenAIServing):
if should_return_as_token_id:
token = f"token_id:{token_id}"
else:
if tokenizer is None:
raise ValueError(
"Tokenizer not available when `skip_tokenizer_init=True`"
)
token = tokenizer.decode(token_id)
logprobs_content.append(
@ -1800,10 +1817,10 @@ class OpenAIServingChat(OpenAIServing):
# Render prompt token ids.
prompt_token_ids = render_for_completion(messages)
engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
engine_prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)
# Add cache_salt if provided in the request
if request.cache_salt is not None:
engine_prompt["cache_salt"] = request.cache_salt
return messages, [prompt_token_ids], [engine_prompt]
return messages, [engine_prompt]

View File

@ -5,29 +5,61 @@ import json
import sys
import time
import traceback
from collections.abc import AsyncGenerator, Callable, Iterable, Mapping, Sequence
from collections.abc import AsyncGenerator, Callable, Iterable, Mapping
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from http import HTTPStatus
from typing import Any, ClassVar, Generic, TypeAlias, TypeVar
import numpy as np
import torch
from fastapi import Request
from openai.types.responses import (
ToolChoiceFunction,
)
from pydantic import ConfigDict, TypeAdapter
from starlette.datastructures import Headers
from typing_extensions import TypeIs
import vllm.envs as envs
from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import (
ChatCompletionMessageParam,
ChatTemplateContentFormatOption,
ConversationMessage,
apply_hf_chat_template,
apply_mistral_chat_template,
parse_chat_messages_futures,
resolve_chat_template_content_format,
)
from vllm.entrypoints.context import (
ConversationContext,
HarmonyContext,
ParsableContext,
StreamingHarmonyContext,
)
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import (
ChatCompletionNamedToolChoiceParam,
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest,
CompletionResponse,
DetokenizeRequest,
ErrorInfo,
ErrorResponse,
FunctionCall,
FunctionDefinition,
ResponseInputOutputItem,
ResponsesRequest,
TokenizeChatRequest,
TokenizeCompletionRequest,
TokenizeResponse,
TranscriptionRequest,
TranscriptionResponse,
TranslationRequest,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.entrypoints.pooling.classify.protocol import (
ClassificationChatRequest,
ClassificationCompletionRequest,
@ -49,58 +81,13 @@ from vllm.entrypoints.pooling.score.protocol import (
ScoreRequest,
ScoreResponse,
)
from vllm.transformers_utils.tokenizer import AnyTokenizer
if sys.version_info >= (3, 12):
from typing import TypedDict
else:
from typing_extensions import TypedDict
from openai.types.responses import (
ToolChoiceFunction,
)
import vllm.envs as envs
from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import (
ChatCompletionMessageParam,
ChatTemplateContentFormatOption,
ConversationMessage,
apply_hf_chat_template,
apply_mistral_chat_template,
parse_chat_messages_futures,
resolve_chat_template_content_format,
)
from vllm.entrypoints.context import ConversationContext
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import (
ChatCompletionNamedToolChoiceParam,
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest,
CompletionResponse,
DetokenizeRequest,
ErrorInfo,
ErrorResponse,
FunctionDefinition,
TokenizeChatRequest,
TokenizeCompletionRequest,
TokenizeResponse,
TranscriptionRequest,
TranscriptionResponse,
TranslationRequest,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig
from vllm.entrypoints.responses_utils import (
construct_input_messages,
)
from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
from vllm.entrypoints.utils import _validate_truncation_size
from vllm.inputs.data import PromptType
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.inputs.data import PromptType, TokensPrompt
from vllm.inputs.parse import (
PromptComponents,
get_prompt_components,
@ -109,10 +96,7 @@ from vllm.inputs.parse import (
from vllm.logger import init_logger
from vllm.logprobs import Logprob, PromptLogprobs
from vllm.lora.request import LoRARequest
from vllm.multimodal import ( # noqa: F401 - Required to resolve Pydantic error in RequestProcessingMixin
MultiModalDataDict,
MultiModalUUIDDict,
)
from vllm.multimodal import MultiModalDataDict
from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams
from vllm.reasoning import ReasoningParser, ReasoningParserManager
@ -185,34 +169,6 @@ AnyResponse: TypeAlias = (
)
class TextTokensPrompt(TypedDict):
prompt: str
prompt_token_ids: list[int]
class EmbedsPrompt(TypedDict):
prompt_embeds: torch.Tensor
RequestPrompt: TypeAlias = list[int] | str | TextTokensPrompt | EmbedsPrompt
def is_text_tokens_prompt(prompt: RequestPrompt) -> TypeIs[TextTokensPrompt]:
return (
isinstance(prompt, dict)
and "prompt_token_ids" in prompt
and "prompt_embeds" not in prompt
)
def is_embeds_prompt(prompt: RequestPrompt) -> TypeIs[EmbedsPrompt]:
return (
isinstance(prompt, dict)
and "prompt_token_ids" not in prompt
and "prompt_embeds" in prompt
)
RequestT = TypeVar("RequestT", bound=AnyRequest)
@ -223,8 +179,7 @@ class RequestProcessingMixin:
handling prompt preparation and engine input.
"""
request_prompts: Sequence[RequestPrompt] | None = field(default_factory=list)
engine_prompts: list[EngineTokensPrompt] | None = field(default_factory=list)
engine_prompts: list[TokensPrompt] | None = field(default_factory=list)
@dataclass(kw_only=True)
@ -425,7 +380,7 @@ class OpenAIServing:
prompts_batch, lora_req_batch = zip(
*[
(
EngineTokensPrompt(
TokensPrompt(
prompt_token_ids=beam.tokens,
multi_modal_data=beam.multi_modal_data,
mm_processor_kwargs=beam.mm_processor_kwargs,
@ -947,7 +902,7 @@ class OpenAIServing:
prompt: str,
tokenizer: TokenizerLike,
add_special_tokens: bool,
) -> TextTokensPrompt:
) -> TokensPrompt:
async_tokenizer = self._get_async_tokenizer(tokenizer)
if (
@ -988,7 +943,7 @@ class OpenAIServing:
request: AnyRequest,
prompt_ids: list[int],
tokenizer: TokenizerLike | None,
) -> TextTokensPrompt:
) -> TokensPrompt:
truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", None)
if truncate_prompt_tokens is None:
@ -1011,7 +966,7 @@ class OpenAIServing:
request: AnyRequest,
input_ids: list[int],
input_text: str,
) -> TextTokensPrompt:
) -> TokensPrompt:
token_num = len(input_ids)
# Note: EmbeddingRequest, ClassificationRequest,
@ -1042,7 +997,7 @@ class OpenAIServing:
f"{token_num} tokens in the input for {operation}. "
f"Please reduce the length of the input."
)
return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
# Note: TokenizeRequest and DetokenizeRequest doesn't have max_tokens
# and does not require model context length validation
@ -1050,7 +1005,7 @@ class OpenAIServing:
request,
(TokenizeCompletionRequest, TokenizeChatRequest, DetokenizeRequest),
):
return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
# chat completion endpoint supports max_completion_tokens
if isinstance(request, ChatCompletionRequest):
@ -1078,7 +1033,7 @@ class OpenAIServing:
f" - {token_num})."
)
return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
async def _tokenize_prompt_input_async(
self,
@ -1086,7 +1041,7 @@ class OpenAIServing:
tokenizer: TokenizerLike,
prompt_input: str | list[int],
add_special_tokens: bool = True,
) -> TextTokensPrompt:
) -> TokensPrompt:
"""
A simpler implementation that tokenizes a single prompt input.
"""
@ -1105,7 +1060,7 @@ class OpenAIServing:
tokenizer: TokenizerLike,
prompt_inputs: Iterable[str | list[int]],
add_special_tokens: bool = True,
) -> AsyncGenerator[TextTokensPrompt, None]:
) -> AsyncGenerator[TokensPrompt, None]:
"""
A simpler implementation that tokenizes multiple prompt inputs.
"""
@ -1158,11 +1113,7 @@ class OpenAIServing:
chat_template_kwargs: dict[str, Any] | None = None,
tool_parser: Callable[[TokenizerLike], ToolParser] | None = None,
add_special_tokens: bool = False,
) -> tuple[
list[ConversationMessage],
Sequence[RequestPrompt],
list[EngineTokensPrompt],
]:
) -> tuple[list[ConversationMessage], list[TokensPrompt]]:
model_config = self.model_config
resolved_content_format = resolve_chat_template_content_format(
@ -1235,9 +1186,7 @@ class OpenAIServing:
"Prompt has to be a string",
"when the tokenizer is not initialised",
)
prompt_inputs = TextTokensPrompt(
prompt=request_prompt, prompt_token_ids=[1]
)
prompt_inputs = TokensPrompt(prompt=request_prompt, prompt_token_ids=[1])
elif isinstance(request_prompt, str):
prompt_inputs = await self._tokenize_prompt_input_async(
request,
@ -1250,14 +1199,15 @@ class OpenAIServing:
assert is_list_of(request_prompt, int), (
"Prompt has to be either a string or a list of token ids"
)
prompt_inputs = TextTokensPrompt(
prompt_inputs = TokensPrompt(
prompt=tokenizer.decode(request_prompt),
prompt_token_ids=request_prompt,
)
engine_prompt = EngineTokensPrompt(
prompt_token_ids=prompt_inputs["prompt_token_ids"]
)
engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["prompt_token_ids"])
if "prompt" in prompt_inputs:
engine_prompt["prompt"] = prompt_inputs["prompt"]
if mm_data is not None:
engine_prompt["multi_modal_data"] = mm_data
@ -1270,7 +1220,7 @@ class OpenAIServing:
if hasattr(request, "cache_salt") and request.cache_salt is not None:
engine_prompt["cache_salt"] = request.cache_salt
return conversation, [request_prompt], [engine_prompt]
return conversation, [engine_prompt]
async def _process_inputs(
self,
@ -1302,7 +1252,7 @@ class OpenAIServing:
async def _render_next_turn(
self,
request: ResponsesRequest,
tokenizer: AnyTokenizer,
tokenizer: TokenizerLike | None,
messages: list[ResponseInputOutputItem],
tool_dicts: list[dict[str, Any]] | None,
tool_parser,
@ -1313,7 +1263,7 @@ class OpenAIServing:
request_input=messages,
)
_, request_prompts, engine_prompts = await self._preprocess_chat(
_, engine_prompts = await self._preprocess_chat(
request,
tokenizer,
new_messages,
@ -1322,20 +1272,20 @@ class OpenAIServing:
chat_template=chat_template,
chat_template_content_format=chat_template_content_format,
)
return request_prompts, engine_prompts
return engine_prompts
async def _generate_with_builtin_tools(
self,
request_id: str,
request_prompt: RequestPrompt,
engine_prompt: EngineTokensPrompt,
engine_prompt: TokensPrompt,
sampling_params: SamplingParams,
context: ConversationContext,
lora_request: LoRARequest | None = None,
priority: int = 0,
**kwargs,
):
prompt_text, _, _ = self._get_prompt_components(request_prompt)
prompt_text, _, _ = self._get_prompt_components(engine_prompt)
orig_priority = priority
sub_request = 0
while True:
@ -1343,7 +1293,7 @@ class OpenAIServing:
sub_request_id = f"{request_id}_{sub_request}"
self._log_inputs(
sub_request_id,
request_prompt,
engine_prompt,
params=sampling_params,
lora_request=lora_request,
)
@ -1388,10 +1338,9 @@ class OpenAIServing:
# Render the next prompt token ids.
if isinstance(context, (HarmonyContext, StreamingHarmonyContext)):
prompt_token_ids = context.render_for_completion()
engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
request_prompt = prompt_token_ids
engine_prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)
elif isinstance(context, ParsableContext):
request_prompts, engine_prompts = await self._render_next_turn(
engine_prompts = await self._render_next_turn(
context.request,
context.tokenizer,
context.parser.response_messages,
@ -1401,8 +1350,7 @@ class OpenAIServing:
context.chat_template_content_format,
)
engine_prompt = engine_prompts[0]
request_prompt = request_prompts[0]
prompt_text, _, _ = self._get_prompt_components(request_prompt)
prompt_text, _, _ = self._get_prompt_components(engine_prompt)
# Update the sampling params.
sampling_params.max_tokens = self.max_model_len - len(
@ -1412,19 +1360,13 @@ class OpenAIServing:
priority = orig_priority - 1
sub_request += 1
def _get_prompt_components(
self,
prompt: RequestPrompt | PromptType,
) -> PromptComponents:
if isinstance(prompt, list):
return PromptComponents(token_ids=prompt)
return get_prompt_components(prompt) # type: ignore[arg-type]
def _get_prompt_components(self, prompt: PromptType) -> PromptComponents:
    # Thin delegation to the shared parser helper; callers unpack the
    # result as (prompt_text, _, _) — see _generate_with_builtin_tools.
    return get_prompt_components(prompt)
def _log_inputs(
self,
request_id: str,
inputs: RequestPrompt | PromptType,
inputs: PromptType,
params: SamplingParams | PoolingParams | BeamSearchParams | None,
lora_request: LoRARequest | None,
) -> None:
@ -1486,7 +1428,7 @@ class OpenAIServing:
@staticmethod
def _parse_tool_calls_from_content(
request: ResponsesRequest | ChatCompletionRequest,
tokenizer: TokenizerLike,
tokenizer: TokenizerLike | None,
enable_auto_tools: bool,
tool_parser_cls: Callable[[TokenizerLike], ToolParser] | None,
content: str | None = None,
@ -1526,6 +1468,11 @@ class OpenAIServing:
and enable_auto_tools
and (request.tool_choice == "auto" or request.tool_choice is None)
):
if tokenizer is None:
raise ValueError(
"Tokenizer not available when `skip_tokenizer_init=True`"
)
# Automatic Tool Call Parsing
try:
tool_parser = tool_parser_cls(tokenizer)

View File

@ -107,7 +107,7 @@ from vllm.entrypoints.responses_utils import (
make_response_output_items_from_parsable_context,
)
from vllm.entrypoints.tool_server import ToolServer
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.inputs.data import TokensPrompt
from vllm.logger import init_logger
from vllm.logprobs import Logprob as SampleLogprob
from vllm.logprobs import SampleLogprobs
@ -258,7 +258,7 @@ class OpenAIServingResponses(OpenAIServing):
self.tool_server = tool_server
def _validate_generator_input(
self, engine_prompt: EngineTokensPrompt
self, engine_prompt: TokensPrompt
) -> ErrorResponse | None:
"""Add validations to the input to the generator here."""
if self.max_model_len <= len(engine_prompt["prompt_token_ids"]):
@ -353,11 +353,11 @@ class OpenAIServingResponses(OpenAIServing):
tokenizer = await self.engine_client.get_tokenizer()
if self.use_harmony:
messages, request_prompts, engine_prompts = (
self._make_request_with_harmony(request, prev_response)
messages, engine_prompts = self._make_request_with_harmony(
request, prev_response
)
else:
messages, request_prompts, engine_prompts = await self._make_request(
messages, engine_prompts = await self._make_request(
request, prev_response, tokenizer
)
@ -393,7 +393,7 @@ class OpenAIServingResponses(OpenAIServing):
assert len(builtin_tool_list) == 0
available_tools = []
try:
for i, engine_prompt in enumerate(engine_prompts):
for engine_prompt in engine_prompts:
maybe_error = self._validate_generator_input(engine_prompt)
if maybe_error is not None:
return maybe_error
@ -420,7 +420,7 @@ class OpenAIServingResponses(OpenAIServing):
context = HarmonyContext(messages, available_tools)
else:
if envs.VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT:
# This is an feature in development for parsing
# This is a feature in development for parsing
# tokens during generation instead of at the end
context = ParsableContext(
response_messages=messages,
@ -449,7 +449,6 @@ class OpenAIServingResponses(OpenAIServing):
)
generator = self._generate_with_builtin_tools(
request_id=request.request_id,
request_prompt=request_prompts[i],
engine_prompt=engine_prompt,
sampling_params=sampling_params,
context=context,
@ -564,7 +563,7 @@ class OpenAIServingResponses(OpenAIServing):
prev_msg=self.msg_store.get(prev_response.id) if prev_response else None,
prev_response_output=prev_response.output if prev_response else None,
)
_, request_prompts, engine_prompts = await self._preprocess_chat(
_, engine_prompts = await self._preprocess_chat(
request,
tokenizer,
messages,
@ -573,7 +572,7 @@ class OpenAIServingResponses(OpenAIServing):
chat_template=self.chat_template,
chat_template_content_format=self.chat_template_content_format,
)
return messages, request_prompts, engine_prompts
return messages, engine_prompts
def _make_request_with_harmony(
self,
@ -586,13 +585,13 @@ class OpenAIServingResponses(OpenAIServing):
)
messages = self._construct_input_messages_with_harmony(request, prev_response)
prompt_token_ids = render_for_completion(messages)
engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
engine_prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)
# Add cache_salt if provided in the request
if request.cache_salt is not None:
engine_prompt["cache_salt"] = request.cache_salt
return messages, [prompt_token_ids], [engine_prompt]
return messages, [engine_prompt]
async def _initialize_tool_sessions(
self,

View File

@ -72,11 +72,7 @@ class ClassificationMixin(OpenAIServing):
if ret:
return ret
(
_,
_,
engine_prompts,
) = await self._preprocess_chat(
_, engine_prompts = await self._preprocess_chat(
cast(ChatCompletionRequest, chat_request),
ctx.tokenizer,
messages,

View File

@ -20,7 +20,6 @@ from vllm.entrypoints.openai.serving_engine import (
EmbeddingServeContext,
OpenAIServing,
ServeContext,
TextTokensPrompt,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.pooling.embed.protocol import (
@ -32,7 +31,7 @@ from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingResponseData,
)
from vllm.entrypoints.renderer import RenderConfig
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.inputs.data import TokensPrompt
from vllm.logger import init_logger
from vllm.outputs import (
EmbeddingRequestOutput,
@ -83,11 +82,7 @@ class EmbeddingMixin(OpenAIServing):
renderer = self._get_renderer(tokenizer)
if isinstance(ctx.request, EmbeddingChatRequest):
(
_,
_,
ctx.engine_prompts,
) = await self._preprocess_chat(
_, ctx.engine_prompts = await self._preprocess_chat(
ctx.request,
tokenizer,
ctx.request.messages,
@ -209,14 +204,13 @@ class EmbeddingMixin(OpenAIServing):
async def _process_chunked_request(
self,
ctx: EmbeddingServeContext,
original_prompt: TextTokensPrompt,
token_ids: list[int],
pooling_params,
trace_headers,
prompt_idx: int,
) -> list[AsyncGenerator[PoolingRequestOutput, None]]:
"""Process a single prompt using chunked processing."""
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
token_ids = original_prompt["prompt_token_ids"]
# Split into chunks using max_position_embeddings
max_pos_embeddings = self._get_max_position_embeddings()
@ -228,18 +222,12 @@ class EmbeddingMixin(OpenAIServing):
chunk_request_id = f"{ctx.request_id}-prompt-{prompt_idx}-chunk-{chunk_idx}"
# Create engine prompt for this chunk
chunk_engine_prompt = EngineTokensPrompt(prompt_token_ids=chunk_tokens)
# Create chunk request prompt for logging
chunk_text = ""
chunk_request_prompt = TextTokensPrompt(
prompt=chunk_text, prompt_token_ids=chunk_tokens
)
chunk_engine_prompt = TokensPrompt(prompt_token_ids=chunk_tokens)
# Log the chunk
self._log_inputs(
chunk_request_id,
chunk_request_prompt,
chunk_engine_prompt,
params=pooling_params,
lora_request=ctx.lora_request,
)
@ -263,7 +251,7 @@ class EmbeddingMixin(OpenAIServing):
request,
input_ids: list[int],
input_text: str,
) -> TextTokensPrompt:
) -> TokensPrompt:
"""Override to support chunked processing for embedding requests."""
token_num = len(input_ids)
@ -328,23 +316,15 @@ class EmbeddingMixin(OpenAIServing):
)
)
return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
# For other request types, use the parent's implementation
return super()._validate_input(request, input_ids, input_text)
def _is_text_tokens_prompt(self, prompt) -> bool:
"""Check if a prompt is a TextTokensPrompt (has prompt_token_ids)."""
return (
isinstance(prompt, dict)
and "prompt_token_ids" in prompt
and "prompt_embeds" not in prompt
)
async def _create_single_prompt_generator(
self,
ctx: EmbeddingServeContext,
engine_prompt: EngineTokensPrompt,
engine_prompt: TokensPrompt,
pooling_params: PoolingParams,
trace_headers: Mapping[str, str] | None,
prompt_index: int,
@ -413,14 +393,16 @@ class EmbeddingMixin(OpenAIServing):
for i, engine_prompt in enumerate(ctx.engine_prompts):
# Check if this specific prompt needs chunked processing
if self._is_text_tokens_prompt(engine_prompt):
# Cast to TextTokensPrompt since we've verified
# prompt_token_ids
text_tokens_prompt = cast(TextTokensPrompt, engine_prompt)
if len(text_tokens_prompt["prompt_token_ids"]) > max_pos_embeddings:
if "prompt_token_ids" in engine_prompt:
prompt_token_ids = engine_prompt["prompt_token_ids"]
if len(prompt_token_ids) > max_pos_embeddings:
# Use chunked processing for this prompt
chunk_generators = await self._process_chunked_request(
ctx, text_tokens_prompt, pooling_params, trace_headers, i
ctx,
prompt_token_ids,
pooling_params,
trace_headers,
i,
)
generators.extend(chunk_generators)
continue
@ -578,14 +560,13 @@ class EmbeddingMixin(OpenAIServing):
# Get original prompt token IDs for this prompt
original_prompt = ctx.engine_prompts[prompt_idx]
if not self._is_text_tokens_prompt(original_prompt):
if "prompt_token_ids" not in original_prompt:
return self.create_error_response(
f"Chunked prompt {prompt_idx} is not a TextTokensPrompt"
f"Chunked prompt {prompt_idx} does not contain "
"token IDs"
)
original_token_ids = cast(TextTokensPrompt, original_prompt)[
"prompt_token_ids"
]
original_token_ids = original_prompt["prompt_token_ids"]
pooling_request_output = PoolingRequestOutput(
request_id=aggregator["request_id"],

View File

@ -137,11 +137,8 @@ class OpenAIServingPooling(OpenAIServing):
)
if error_check_ret is not None:
return error_check_ret
(
_,
_,
engine_prompts,
) = await self._preprocess_chat(
_, engine_prompts = await self._preprocess_chat(
request,
tokenizer,
request.messages,

View File

@ -12,9 +12,7 @@ import torch
from pydantic import Field
from vllm.config import ModelConfig
from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt
from vllm.inputs.data import TextPrompt as EngineTextPrompt
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.inputs.data import EmbedsPrompt, TextPrompt, TokensPrompt
from vllm.inputs.parse import get_prompt_components, parse_raw_prompts
from vllm.tokenizers import TokenizerLike
from vllm.utils.async_utils import AsyncMicrobatchTokenizer
@ -97,7 +95,7 @@ class BaseRenderer(ABC):
*,
prompt_or_prompts: str | list[str] | list[int] | list[list[int]],
config: RenderConfig,
) -> list[EngineTokensPrompt]:
) -> list[TokensPrompt]:
"""
Convert text or token inputs into engine-ready TokensPrompt objects.
@ -115,7 +113,7 @@ class BaseRenderer(ABC):
(e.g., tokenization and length handling).
Returns:
list[EngineTokensPrompt]: Engine-ready token prompts.
list[TokensPrompt]: Engine-ready token prompts.
Raises:
ValueError: If input formats are invalid or length limits exceeded.
@ -129,7 +127,7 @@ class BaseRenderer(ABC):
prompt_or_prompts: str | list[str] | list[int] | list[list[int]] | None = None,
prompt_embeds: bytes | list[bytes] | None = None,
config: RenderConfig,
) -> list[EngineTokensPrompt | EngineEmbedsPrompt]:
) -> list[TokensPrompt | EmbedsPrompt]:
"""
Convert text/token and/or base64-encoded embeddings inputs into
engine-ready prompt objects using a unified RenderConfig.
@ -146,7 +144,7 @@ class BaseRenderer(ABC):
(e.g., tokenization and length handling).
Returns:
list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]:
list[Union[TokensPrompt, EmbedsPrompt]]:
Engine-ready prompt objects.
Raises:
@ -161,14 +159,14 @@ class BaseRenderer(ABC):
prompt_embeds: bytes | list[bytes],
truncate_prompt_tokens: Annotated[int, Field(ge=0)] | None = None,
cache_salt: str | None = None,
) -> list[EngineEmbedsPrompt]:
) -> list[EmbedsPrompt]:
"""Load and validate base64-encoded embeddings into prompt objects."""
if not self.model_config.enable_prompt_embeds:
raise ValueError(
"You must set `--enable-prompt-embeds` to input `prompt_embeds`."
)
def _load_and_validate_embed(embed: bytes) -> EngineEmbedsPrompt:
def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
tensor = torch.load(
io.BytesIO(pybase64.b64decode(embed, validate=True)),
weights_only=True,
@ -185,7 +183,7 @@ class BaseRenderer(ABC):
assert tensor.dim() == 2
if truncate_prompt_tokens is not None:
tensor = tensor[-truncate_prompt_tokens:]
embeds_prompt = EngineEmbedsPrompt(prompt_embeds=tensor)
embeds_prompt = EmbedsPrompt(prompt_embeds=tensor)
if cache_salt is not None:
embeds_prompt["cache_salt"] = cache_salt
return embeds_prompt
@ -213,7 +211,7 @@ class CompletionRenderer(BaseRenderer):
*,
prompt_or_prompts: str | list[str] | list[int] | list[list[int]],
config: RenderConfig,
) -> list[EngineTokensPrompt]:
) -> list[TokensPrompt]:
"""Implementation of prompt rendering for completion-style requests.
Uses async tokenizer pooling for improved performance. See base class
@ -240,7 +238,7 @@ class CompletionRenderer(BaseRenderer):
prompt_or_prompts: str | list[str] | list[int] | list[list[int]] | None = None,
prompt_embeds: bytes | list[bytes] | None = None,
config: RenderConfig,
) -> list[EngineTokensPrompt | EngineEmbedsPrompt]:
) -> list[TokensPrompt | EmbedsPrompt]:
"""
Render text/token prompts and/or precomputed embedding prompts. At
least one of `prompt_or_prompts` or `prompt_embeds` must be provided.
@ -249,7 +247,7 @@ class CompletionRenderer(BaseRenderer):
if truncate_prompt_tokens == 0:
return []
rendered: list[EngineTokensPrompt | EngineEmbedsPrompt] = []
rendered: list[TokensPrompt | EmbedsPrompt] = []
if prompt_embeds is not None:
rendered.extend(
@ -281,10 +279,10 @@ class CompletionRenderer(BaseRenderer):
async def _create_prompt(
self,
prompt_input: EngineTextPrompt | EngineTokensPrompt,
prompt_input: TextPrompt | TokensPrompt,
config: RenderConfig,
truncate_prompt_tokens: int | None,
) -> EngineTokensPrompt:
) -> TokensPrompt:
prompt, prompt_token_ids, _ = get_prompt_components(prompt_input)
if prompt_token_ids is not None:
@ -317,7 +315,7 @@ class CompletionRenderer(BaseRenderer):
truncate_prompt_tokens: int | None,
add_special_tokens: bool,
cache_salt: str | None,
) -> EngineTokensPrompt:
) -> TokensPrompt:
"""Tokenize text input asynchronously."""
async_tokenizer = self._get_async_tokenizer()
@ -350,7 +348,7 @@ class CompletionRenderer(BaseRenderer):
truncate_prompt_tokens: int | None,
cache_salt: str | None,
needs_detokenization: bool | None = False,
) -> EngineTokensPrompt:
) -> TokensPrompt:
"""Optionally detokenize token IDs and build a tokens prompt."""
token_ids = self._maybe_apply_truncation(token_ids, truncate_prompt_tokens)
@ -392,8 +390,8 @@ class CompletionRenderer(BaseRenderer):
max_length: int | None = None,
cache_salt: str | None = None,
prompt: str | None = None,
) -> EngineTokensPrompt:
"""Create validated EngineTokensPrompt."""
) -> TokensPrompt:
"""Create validated TokensPrompt."""
if max_length is not None and len(token_ids) > max_length:
raise ValueError(
f"This model's maximum context length is {max_length} tokens. "
@ -401,7 +399,7 @@ class CompletionRenderer(BaseRenderer):
"Please reduce the length of the input messages."
)
tokens_prompt = EngineTokensPrompt(prompt_token_ids=token_ids)
tokens_prompt = TokensPrompt(prompt_token_ids=token_ids)
if cache_salt is not None:
tokens_prompt["cache_salt"] = cache_salt
if prompt is not None:

View File

@ -27,7 +27,7 @@ from vllm.entrypoints.serve.disagg.protocol import (
GenerateResponse,
GenerateResponseChoice,
)
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.inputs.data import TokensPrompt
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.outputs import RequestOutput
@ -99,7 +99,7 @@ class ServingTokens(OpenAIServing):
# TODO(NickLucche): Change to EngineCoreRequest once Renderer work is
# completed
engine_prompt = EngineTokensPrompt(prompt_token_ids=request.token_ids)
engine_prompt = TokensPrompt(prompt_token_ids=request.token_ids)
if request.features is not None:
engine_prompt["multi_modal_data"] = None
@ -115,7 +115,7 @@ class ServingTokens(OpenAIServing):
self._log_inputs(
request_id,
request.token_ids,
TokensPrompt(prompt_token_ids=request.token_ids),
params=sampling_params,
lora_request=lora_request,
)

View File

@ -21,6 +21,7 @@ from vllm.entrypoints.openai.protocol import (
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig
from vllm.inputs import TokensPrompt
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
@ -80,11 +81,8 @@ class OpenAIServingTokenization(OpenAIServing):
)
if error_check_ret is not None:
return error_check_ret
(
_,
_,
engine_prompts,
) = await self._preprocess_chat(
_, engine_prompts = await self._preprocess_chat(
request,
tokenizer,
request.messages,
@ -141,7 +139,10 @@ class OpenAIServingTokenization(OpenAIServing):
tokenizer = await self.engine_client.get_tokenizer()
self._log_inputs(
request_id, request.tokens, params=None, lora_request=lora_request
request_id,
TokensPrompt(prompt_token_ids=request.tokens),
params=None,
lora_request=lora_request,
)
prompt_input = await self._tokenize_prompt_input_async(

View File

@ -33,22 +33,31 @@ def parse_raw_prompts(
if len(prompt) == 0:
raise ValueError("please provide at least one prompt")
# case 2: array of strings
if is_list_of(prompt, str):
# case 2: array of strings
prompt = cast(list[str], prompt)
return [TextPrompt(prompt=elem) for elem in prompt]
# case 3: array of tokens
if is_list_of(prompt, int):
# case 3: array of tokens
prompt = cast(list[int], prompt)
return [TokensPrompt(prompt_token_ids=prompt)]
if is_list_of(prompt, list):
prompt = cast(list[list[int]], prompt)
if len(prompt[0]) == 0:
raise ValueError("please provide at least one prompt")
if is_list_of(prompt[0], int):
# case 4: array of token arrays
return [TokensPrompt(prompt_token_ids=elem) for elem in prompt]
# case 4: array of token arrays
if is_list_of(prompt, list):
first = prompt[0]
if not isinstance(first, list):
raise ValueError("prompt expected to be a list of lists")
if len(first) == 0:
raise ValueError("Please provide at least one prompt")
# strict validation: every nested list must be list[int]
if not all(is_list_of(elem, int) for elem in prompt):
raise TypeError("Nested lists must contain only integers")
prompt = cast(list[list[int]], prompt)
return [TokensPrompt(prompt_token_ids=elem) for elem in prompt]
raise TypeError(
"prompt must be a string, array of strings, "

View File

@ -885,12 +885,11 @@ def get_moe_configs(
# If no optimized configuration is available, we will use the default
# configuration
logger.warning(
(
"Using default MoE config. Performance might be sub-optimal! "
"Config file not found at %s"
),
config_file_paths,
logger.warning_once(
"Using default MoE config. Performance might be sub-optimal! "
"Config file not found at %s",
", ".join(config_file_paths),
scope="local",
)
return None

View File

@ -369,7 +369,9 @@ class FusedMoE(CustomOp):
# aux_stream() returns None on non-cuda-alike platforms.
self.shared_experts_stream = aux_stream()
if self.shared_experts_stream is not None:
logger.info_once("Enabled separate cuda stream for MoE shared_experts")
logger.info_once(
"Enabled separate cuda stream for MoE shared_experts", scope="local"
)
if params_dtype is None:
params_dtype = torch.get_default_dtype()

View File

@ -30,8 +30,8 @@ class SharedFusedMoE(FusedMoE):
# Disable shared expert overlap if:
# - we are using eplb, because of correctness issues
# - we are using flashinfer with DP, since there nothint to gain
# - we are using marlin kjernels
# - we are using flashinfer with DP, since there nothing to gain
# - we are using marlin kernels
self.use_overlapped = (
use_overlapped
and not (

View File

@ -62,7 +62,7 @@ def choose_scaled_mm_linear_kernel(
continue
# If the current platform uses compute_capability,
# make sure the kernel supports the compute cability.
# make sure the kernel supports the compute capability.
is_supported, reason = kernel.is_supported(compute_capability)
if not is_supported:
failure_reasons.append(f"{kernel.__name__}: {reason}")

View File

@ -188,7 +188,24 @@ class ModelOptQuantConfigBase(QuantizationConfig):
def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
if len(self.exclude_modules) > 0:
self.exclude_modules = hf_to_vllm_mapper.apply_list(self.exclude_modules)
# This is a workaround for the weights remapping issue:
# https://github.com/vllm-project/vllm/issues/28072
# Right now, the Nvidia ModelOpt library use just one wildcard pattern:
# module_path*
# It gets applied if the whole tree of modules rooted at module_path
# is not quantized. Here we replace such pattern by 2 patterns that are
# collectively equivalent to the original pattern:
# module_path
# module_path.*
new_exclude_modules = []
for exclude in self.exclude_modules:
if len(exclude) >= 2 and exclude[-1] == "*" and exclude[-2] != ".":
new_exclude_modules.append(exclude[:-1])
new_exclude_modules.append(exclude[:-1] + ".*")
else:
new_exclude_modules.append(exclude)
self.exclude_modules = hf_to_vllm_mapper.apply_list(new_exclude_modules)
@staticmethod
def get_config_filenames() -> list[str]:

View File

@ -17,6 +17,9 @@ from vllm.model_executor.layers.fused_moe.layer import (
FusedMoEMethodBase,
FusedMoeWeightScaleSupported,
)
from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import (
UnquantizedFusedMoEMethod,
)
from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
@ -162,6 +165,8 @@ class MoeWNA16Config(QuantizationConfig):
self, layer: torch.nn.Module, prefix: str
) -> Optional["QuantizeMethodBase"]:
if is_layer_skipped_quant(prefix, self.modules_to_not_convert):
if isinstance(layer, FusedMoE):
return UnquantizedFusedMoEMethod(layer.moe_config)
return UnquantizedLinearMethod()
elif isinstance(layer, LinearBase):
# Avoid circular import

View File

@ -0,0 +1,639 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 The vLLM team.
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections.abc import Iterable, Mapping, Sequence
from typing import Annotated, Any, Literal, TypeAlias
import torch
import torch.nn as nn
from transformers import BatchFeature, PretrainedConfig
from transformers.models.audioflamingo3 import (
AudioFlamingo3Config,
AudioFlamingo3Processor,
)
from transformers.models.qwen2_audio import Qwen2AudioEncoder
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
from vllm.multimodal.parse import (
DictEmbeddingItems,
ModalityData,
ModalityDataItems,
MultiModalDataItems,
MultiModalDataParser,
)
from vllm.multimodal.processing import (
BaseMultiModalProcessor,
BaseProcessingInfo,
PromptReplacement,
PromptUpdate,
PromptUpdateDetails,
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (
MultiModalEmbeddings,
SupportsLoRA,
SupportsMultiModal,
SupportsPP,
)
from .utils import (
AutoWeightsLoader,
init_vllm_registered_model,
maybe_prefix,
)
MAX_AUDIO_LEN = 10 * 60
# === Audio Inputs === #
class AudioFlamingo3FeatureInputs(TensorSchema):
    """
    Dimensions:
        - num_chunks: Number of audio chunks (flattened)
        - nmb: Number of mel bins
        - num_audios: Number of original audio files
    """

    type: Literal["audio_features"]

    # Mel features per chunk; the last axis is a fixed 3000 frames.
    input_features: Annotated[
        torch.Tensor | list[torch.Tensor],
        TensorShape("num_chunks", "nmb", 3000),
    ]

    # Per-chunk mask over the 3000 feature frames.
    feature_attention_mask: Annotated[
        torch.Tensor,
        TensorShape("num_chunks", 3000),
    ]

    # How many chunks each original audio file was split into; allows
    # regrouping the flattened chunk axis back per audio.
    chunk_counts: Annotated[
        torch.Tensor,
        TensorShape("num_audios"),
    ]
class AudioFlamingo3EmbeddingInputs(TensorSchema):
    """
    Dimensions:
        - bn: Batch size
        - naf: Number of audio features
        - hs: Hidden size (must match the hidden size of language model
          backbone)
    """

    type: Literal["audio_embeds"] = "audio_embeds"

    # Precomputed audio embeddings, one tensor per batch item.
    audio_embeds: Annotated[
        list[torch.Tensor],
        TensorShape("bn", "naf", "hs"),
    ]
# Audio input may arrive either as raw mel features or as precomputed
# embeddings.
AudioFlamingo3Inputs: TypeAlias = (
    AudioFlamingo3FeatureInputs | AudioFlamingo3EmbeddingInputs
)
class AudioFlamingo3Encoder(Qwen2AudioEncoder):
    """Qwen2-Audio encoder stack followed by an average-pool (time/2) and
    a final LayerNorm.

    Fix: ``attention_mask`` previously had a bare ``torch.Tensor``
    annotation with a ``None`` default; it is now annotated as optional.
    """

    def __init__(
        self,
        config: PretrainedConfig,
    ):
        super().__init__(config)
        # Halves the temporal resolution after the transformer layers.
        self.avg_pooler = nn.AvgPool1d(kernel_size=2, stride=2)
        # self.layer_norm is already initialized in super().__init__

    def forward(
        self,
        input_features: torch.Tensor | list[torch.Tensor],
        attention_mask: torch.Tensor | None = None,
    ):
        """Encode mel features into hidden states.

        Args:
            input_features: (batch, num_mel_bins, seq_len), or a list of
                such tensors that is stacked along the batch axis.
            attention_mask: Optional mask forwarded to each encoder layer.

        Returns:
            Hidden states of shape (batch, seq_len/2, hidden_size).
        """
        # input_features: (batch, num_mel_bins, seq_len)
        if isinstance(input_features, list):
            input_features = torch.stack(input_features)

        hidden_states = nn.functional.gelu(self.conv1(input_features))
        hidden_states = nn.functional.gelu(self.conv2(hidden_states))
        hidden_states = hidden_states.transpose(-1, -2)
        # Add learned positions, truncated to the current sequence length.
        hidden_states = (
            hidden_states + self.embed_positions.weight[: hidden_states.size(-2), :]
        ).to(hidden_states.dtype)

        for layer in self.layers:
            layer_outputs = layer(hidden_states, attention_mask)
            hidden_states = layer_outputs[0]

        # AvgPool (time/2) + LayerNorm
        # hidden_states: (batch, seq_len, hidden_size)
        hidden_states = hidden_states.permute(0, 2, 1)  # (batch, hidden_size, seq_len)
        hidden_states = self.avg_pooler(hidden_states)
        hidden_states = hidden_states.permute(
            0, 2, 1
        )  # (batch, seq_len/2, hidden_size)
        hidden_states = self.layer_norm(hidden_states)
        return hidden_states

    def _get_feat_extract_output_lengths(self, input_lengths: torch.Tensor):
        """
        Computes the output length of the convolutional layers and the output length
        of the audio encoder
        """
        # Conv stride-2, then avg-pool stride-2.
        input_lengths = (input_lengths - 1) // 2 + 1
        output_lengths = (input_lengths - 2) // 2 + 1
        return input_lengths, output_lengths
class AudioFlamingo3MultiModalProjector(nn.Module):
    """Two-layer MLP projecting audio-tower features to the LM hidden size."""

    def __init__(self, config: PretrainedConfig):
        super().__init__()
        audio_hidden = config.audio_config.hidden_size
        text_hidden = config.text_config.hidden_size
        use_bias = config.projector_bias
        self.linear_1 = nn.Linear(audio_hidden, text_hidden, bias=use_bias)
        self.act = get_act_fn(config.projector_hidden_act)
        self.linear_2 = nn.Linear(text_hidden, text_hidden, bias=use_bias)

    def forward(self, audio_features):
        # linear -> activation -> linear
        return self.linear_2(self.act(self.linear_1(audio_features)))
class AudioFlamingo3ProcessingInfo(BaseProcessingInfo):
    """Accessors for the AudioFlamingo3 HF config/processor objects."""

    def get_hf_config(self):
        return self.ctx.get_hf_config(AudioFlamingo3Config)

    def get_hf_processor(self, **kwargs: object):
        return self.ctx.get_hf_processor(AudioFlamingo3Processor, **kwargs)

    def get_feature_extractor(self, **kwargs: object):
        # The feature extractor is bundled inside the HF processor.
        return self.get_hf_processor(**kwargs).feature_extractor

    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
        # No hard limit on the number of audio items per prompt.
        return {"audio": None}
class AudioFlamingo3DummyInputsBuilder(
    BaseDummyInputsBuilder[AudioFlamingo3ProcessingInfo]
):
    """Builds worst-case dummy text/audio inputs for memory profiling."""

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        count = mm_counts.get("audio", 0)
        return self.info.get_hf_processor().audio_token * count

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
        mm_options: Mapping[str, BaseDummyOptions] | None = None,
    ) -> MultiModalDataDict:
        extractor = self.info.get_feature_extractor()
        # Use the longest supported audio so profiling covers the worst case.
        dummy_len = MAX_AUDIO_LEN * extractor.sampling_rate
        overrides = mm_options.get("audio") if mm_options else None
        return {
            "audio": self._get_dummy_audios(
                length=dummy_len,
                num_audios=mm_counts.get("audio", 0),
                overrides=overrides,
            )
        }
def _audioflamingo3_field_config(hf_inputs: Mapping[str, torch.Tensor]):
    """Map HF processor outputs to multimodal field configs.

    When ``chunk_counts`` is present, long audios were split into multiple
    chunks and the per-chunk tensors are flattened along dim 0, so they must
    be sliced back per audio item via ``flat_from_sizes``.
    """
    chunk_counts = hf_inputs.get("chunk_counts")
    if chunk_counts is None:
        features_cfg = MultiModalFieldConfig.batched("audio")
        mask_cfg = MultiModalFieldConfig.batched("audio")
    else:
        features_cfg = MultiModalFieldConfig.flat_from_sizes(
            "audio", chunk_counts, dim=0
        )
        mask_cfg = MultiModalFieldConfig.flat_from_sizes(
            "audio", chunk_counts, dim=0
        )
    return dict(
        audio_embeds=MultiModalFieldConfig.batched("audio"),
        input_features=features_cfg,
        feature_attention_mask=mask_cfg,
        chunk_counts=MultiModalFieldConfig.batched("audio"),
    )
class AudioFlamingo3MultiModalDataParser(MultiModalDataParser):
    """Data parser that also accepts pre-computed audio embeddings."""

    def _parse_audio_data(
        self,
        data: dict[str, torch.Tensor] | ModalityData[Any],
    ) -> ModalityDataItems[Any, Any] | None:
        # Non-dict payloads are raw audio; defer to the base parser.
        if not isinstance(data, dict):
            return super()._parse_audio_data(data)
        # A dict payload carries embeddings instead of raw waveforms.
        return DictEmbeddingItems(
            data,
            modality="audio",
            required_fields={"audio_embeds"},
            fields_factory=_audioflamingo3_field_config,
        )
class AudioFlamingo3MultiModalProcessor(
    BaseMultiModalProcessor[AudioFlamingo3ProcessingInfo]
):
    """Multimodal processor handling chunked audio for AudioFlamingo3.

    Long audios are split by the HF processor into fixed-size mel chunks;
    this class computes how many chunks each original audio produces
    (``chunk_counts``) so downstream code can regroup per-chunk tensors
    back to per-audio items.
    """

    def _get_data_parser(self) -> MultiModalDataParser:
        feature_extractor = self.info.get_feature_extractor()
        return AudioFlamingo3MultiModalDataParser(
            target_sr=feature_extractor.sampling_rate
        )

    def _call_hf_processor(
        self,
        prompt: str,
        mm_data: dict[str, object],
        mm_kwargs: Mapping[str, Any],
        tok_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        """Run the HF processor, rename its mask output, and attach
        per-audio ``chunk_counts``."""
        # Normalize the "audios" key used by some callers to "audio".
        audios = mm_data.pop("audios", [])
        if audios:
            mm_data["audio"] = audios
        # Text-only prompt: tokenize without running the feature extractor.
        if not mm_data.get("audio", []):
            prompt_ids = self.info.get_tokenizer().encode(prompt)
            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
        feature_extractor = self.info.get_feature_extractor(**mm_kwargs)
        mm_kwargs = dict(
            **mm_kwargs,
            sampling_rate=feature_extractor.sampling_rate,
        )
        # Calculate chunk counts
        audio_list = mm_data.get("audio")
        if not isinstance(audio_list, list):
            audio_list = [audio_list]
        chunk_counts = []
        sampling_rate = feature_extractor.sampling_rate
        chunk_length = feature_extractor.chunk_length
        # Samples per chunk window (chunk_length is in seconds).
        window_size = int(sampling_rate * chunk_length)
        # MAX_AUDIO_LEN is 10 * 60 in HF processor.
        max_windows = int(MAX_AUDIO_LEN // chunk_length)
        for audio in audio_list:
            # audio is numpy array or list
            n_samples = len(audio) if isinstance(audio, list) else audio.shape[0]
            # Ceil-divide into windows; every audio yields at least one chunk.
            n_win = max(1, (n_samples + window_size - 1) // window_size)
            if n_win > max_windows:
                n_win = max_windows
            chunk_counts.append(n_win)
        outputs = super()._call_hf_processor(
            prompt=prompt,
            mm_data=mm_data,
            mm_kwargs=mm_kwargs,
            tok_kwargs=tok_kwargs,
        )
        # Rename the HF mask key to the name expected by the model.
        if "input_features_mask" in outputs:
            outputs["feature_attention_mask"] = outputs.pop("input_features_mask")
        outputs["chunk_counts"] = torch.tensor(chunk_counts, dtype=torch.long)
        return outputs

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        return _audioflamingo3_field_config(hf_inputs)

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargsItems,
    ) -> Sequence[PromptUpdate]:
        """Replace each audio placeholder with one token per audio feature."""
        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        tokenizer = self.info.get_tokenizer()
        vocab = tokenizer.get_vocab()
        audio_token = getattr(processor, "audio_token", "<sound>")
        audio_token_id = vocab.get(audio_token)
        if audio_token_id is None:
            # Fallback if not found, though it should be there
            audio_token_id = processor.audio_token_id
        out_mm_data = out_mm_kwargs.get_data()
        feature_attention_mask = out_mm_data.get("feature_attention_mask")
        chunk_counts = out_mm_data.get("chunk_counts")

        def get_replacement_audioflamingo3(item_idx: int):
            # Determine how many placeholder tokens this audio item needs.
            if feature_attention_mask is not None:
                if chunk_counts is not None:
                    counts = (
                        chunk_counts.tolist()
                        if isinstance(chunk_counts, torch.Tensor)
                        else chunk_counts
                    )
                    # Chunks are stored flat; slice out this item's chunks.
                    start_idx = sum(counts[:item_idx])
                    count = counts[item_idx]
                    end_idx = start_idx + count
                    if isinstance(feature_attention_mask, list):
                        mask_list = feature_attention_mask[start_idx:end_idx]
                        if len(mask_list) > 0 and isinstance(
                            mask_list[0], torch.Tensor
                        ):
                            mask = torch.stack(mask_list)
                        else:
                            mask = torch.tensor(mask_list)
                    else:
                        mask = feature_attention_mask[start_idx:end_idx]
                else:
                    # feature_attention_mask is list[Tensor] or Tensor
                    if isinstance(feature_attention_mask, list):
                        mask = feature_attention_mask[item_idx]
                    else:
                        mask = feature_attention_mask[item_idx].unsqueeze(0)
                # mask shape: (num_chunks, 3000)
                input_lengths = mask.sum(-1)
                # Mirror the encoder's conv (stride 2) and avg-pool (stride 2)
                # downsampling to get the output token count per chunk.
                conv_lengths = (input_lengths - 1) // 2 + 1
                audio_output_lengths = (conv_lengths - 2) // 2 + 1
                num_features = audio_output_lengths.sum().item()
            else:
                # Embedding inputs: token count comes from the tensor itself.
                audio_embeds = out_mm_data["audio_embeds"][item_idx]
                num_features = audio_embeds.shape[0]
            if num_features == 0:
                raise ValueError("Audio is too short")
            audio_tokens = [audio_token_id] * int(num_features)
            return PromptUpdateDetails.select_token_id(
                audio_tokens,
                embed_token_id=audio_token_id,
            )

        return [
            PromptReplacement(
                modality="audio",
                target=audio_token,
                replacement=get_replacement_audioflamingo3,
            )
        ]
@MULTIMODAL_REGISTRY.register_processor(
    AudioFlamingo3MultiModalProcessor,
    info=AudioFlamingo3ProcessingInfo,
    dummy_inputs=AudioFlamingo3DummyInputsBuilder,
)
class AudioFlamingo3ForConditionalGeneration(
    nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
):
    """
    AudioFlamingo3 model for conditional generation.

    This model integrates a Whisper-based audio encoder with a Qwen2 language
    model. It supports multi-chunk audio processing: long audios are split
    into chunks, encoded independently, then the chunk embeddings are
    concatenated back per original audio item.
    """

    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }

    def get_mm_mapping(self) -> MultiModelKeys:
        """
        Get the module prefix in multimodal models
        """
        return MultiModelKeys.from_string_field(
            language_model="language_model.",
            connector="multi_modal_projector.",
            tower_model="audio_tower.",
        )

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        multimodal_config = vllm_config.model_config.multimodal_config
        self.config = config
        self.multimodal_config = multimodal_config
        # Audio tower -> projector -> Qwen2 language model.
        self.audio_tower = AudioFlamingo3Encoder(
            config.audio_config,
        )
        self.multi_modal_projector = AudioFlamingo3MultiModalProjector(config)
        self.quant_config = quant_config
        self.language_model = init_vllm_registered_model(
            vllm_config=vllm_config,
            hf_config=config.text_config,
            prefix=maybe_prefix(prefix, "language_model"),
            architectures=["Qwen2ForCausalLM"],
        )
        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors
        )

    def _parse_and_validate_audio_input(
        self, **kwargs: object
    ) -> AudioFlamingo3Inputs | None:
        """Extract audio tensors from model kwargs, or None if no audio."""
        input_features = kwargs.pop("input_features", None)
        audio_embeds = kwargs.pop("audio_embeds", None)
        feature_attention_mask = kwargs.pop("feature_attention_mask", None)
        chunk_counts = kwargs.pop("chunk_counts", None)
        if input_features is None and audio_embeds is None:
            return None
        # Pre-computed embeddings take precedence over raw features.
        if audio_embeds is not None:
            return AudioFlamingo3EmbeddingInputs(
                type="audio_embeds", audio_embeds=audio_embeds
            )
        if input_features is not None:
            return AudioFlamingo3FeatureInputs(
                type="audio_features",
                input_features=input_features,
                feature_attention_mask=feature_attention_mask,
                chunk_counts=chunk_counts,
            )
        raise AssertionError("This line should be unreachable.")

    def _process_audio_input(
        self, audio_input: AudioFlamingo3Inputs
    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
        """Encode audio chunks and regroup embeddings per original audio.

        Returns a tuple with one (num_tokens, hidden_size) tensor per audio
        item.
        """
        if audio_input["type"] == "audio_embeds":
            audio_embeds = audio_input["audio_embeds"]
            return tuple(audio_embeds)
        input_features = audio_input["input_features"]
        feature_attention_mask = audio_input["feature_attention_mask"]
        chunk_counts = audio_input.get("chunk_counts")
        # Flatten per-audio lists into a single (total_chunks, ...) batch.
        if isinstance(input_features, list):
            input_features = torch.cat(input_features, dim=0)
            feature_attention_mask = torch.cat(feature_attention_mask, dim=0)
        # Normalize chunk_counts to a plain list of ints.
        if chunk_counts is None:
            # No chunking info: assume one chunk per audio item.
            chunk_counts = [1] * input_features.shape[0]
        elif isinstance(chunk_counts, torch.Tensor):
            chunk_counts = chunk_counts.tolist()
        elif (
            isinstance(chunk_counts, list)
            and chunk_counts
            and isinstance(chunk_counts[0], torch.Tensor)
        ):
            chunk_counts = [c.item() for c in chunk_counts]
        # Calculate output lengths
        input_lengths = feature_attention_mask.sum(-1)
        # Conv downsampling
        conv_lengths = (input_lengths - 1) // 2 + 1
        # AvgPool downsampling
        audio_output_lengths = (conv_lengths - 2) // 2 + 1
        batch_size, _, max_mel_seq_len = input_features.shape
        # Calculate max_seq_len after convs (before pooling) for attention mask
        max_seq_len = (max_mel_seq_len - 1) // 2 + 1
        # Create a sequence tensor of shape (batch_size, max_seq_len)
        seq_range = (
            torch.arange(
                0,
                max_seq_len,
                dtype=conv_lengths.dtype,
                device=conv_lengths.device,
            )
            .unsqueeze(0)
            .expand(batch_size, max_seq_len)
        )
        lengths_expand = conv_lengths.unsqueeze(-1).expand(batch_size, max_seq_len)
        # Create mask
        padding_mask = seq_range >= lengths_expand
        # Build an additive attention mask: 0 at valid positions, -inf at
        # padding, broadcast over query positions.
        audio_attention_mask_ = padding_mask.view(batch_size, 1, 1, max_seq_len).expand(
            batch_size, 1, max_seq_len, max_seq_len
        )
        audio_attention_mask = audio_attention_mask_.to(
            dtype=self.audio_tower.conv1.weight.dtype,
            device=self.audio_tower.conv1.weight.device,
        )
        audio_attention_mask[audio_attention_mask_] = float("-inf")
        # Forward pass
        audio_features = self.audio_tower(
            input_features, attention_mask=audio_attention_mask
        )
        # Project
        audio_features = self.multi_modal_projector(audio_features)
        # Masking after pooling
        num_audios, max_audio_tokens, embed_dim = audio_features.shape
        audio_output_lengths = audio_output_lengths.unsqueeze(1)
        audio_features_mask = (
            torch.arange(max_audio_tokens)
            .expand(num_audios, max_audio_tokens)
            .to(audio_output_lengths.device)
            < audio_output_lengths
        )
        # Drop padded positions, yielding a flat (total_tokens, dim) tensor.
        masked_audio_features = audio_features[audio_features_mask].view(-1, embed_dim)
        # Split to tuple of embeddings for individual audio input.
        chunk_embeddings = torch.split(
            masked_audio_features, audio_output_lengths.flatten().tolist()
        )
        # Concatenate chunk embeddings belonging to the same original audio.
        grouped_embeddings = []
        current_idx = 0
        for count in chunk_counts:
            audio_chunks = chunk_embeddings[current_idx : current_idx + count]
            grouped_embeddings.append(torch.cat(audio_chunks, dim=0))
            current_idx += count
        return tuple(grouped_embeddings)

    def get_language_model(self) -> torch.nn.Module:
        return self.language_model

    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
        """Return per-audio embedding tensors, or [] if no audio input."""
        audio_input = self._parse_and_validate_audio_input(**kwargs)
        if audio_input is None:
            return []
        masked_audio_features = self._process_audio_input(audio_input)
        return masked_audio_features

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs: object,
    ) -> torch.Tensor | IntermediateTensors:
        # With pipeline parallelism, non-first ranks receive intermediate
        # tensors and must not consume inputs_embeds.
        if intermediate_tensors is not None:
            inputs_embeds = None
        hidden_states = self.language_model.model(
            input_ids,
            positions,
            intermediate_tensors,
            inputs_embeds=inputs_embeds,
        )
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor | None:
        return self.language_model.compute_logits(hidden_states)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(self)
        return loader.load_weights(weights)

View File

@ -612,7 +612,7 @@ class Qwen2_5_VisionTransformer(nn.Module):
# DO NOT MOVE THIS IMPORT
from vllm.compilation.backends import set_model_tag
with set_model_tag("Qwen2_5_VisionPatchEmbed"):
with set_model_tag("Qwen2_5_VisionPatchEmbed", is_encoder=True):
self.patch_embed = Qwen2_5_VisionPatchEmbed(
patch_size=patch_size,
temporal_patch_size=temporal_patch_size,
@ -651,7 +651,7 @@ class Qwen2_5_VisionTransformer(nn.Module):
f"Qwen2.5-VL does not support {self.attn_backend} backend now."
)
with set_model_tag("Qwen2_5_VisionBlock"):
with set_model_tag("Qwen2_5_VisionBlock", is_encoder=True):
self.blocks = nn.ModuleList(
[
Qwen2_5_VisionBlock(
@ -670,7 +670,7 @@ class Qwen2_5_VisionTransformer(nn.Module):
]
)
with set_model_tag("Qwen2_5_VisionPatchMerger"):
with set_model_tag("Qwen2_5_VisionPatchMerger", is_encoder=True):
self.merger = Qwen2_5_VisionPatchMerger(
d_model=vision_config.out_hidden_size,
context_dim=self.hidden_size,

View File

@ -264,6 +264,10 @@ _CROSS_ENCODER_MODELS = {
_MULTIMODAL_MODELS = {
# [Decoder-only]
"AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"),
"AudioFlamingo3ForConditionalGeneration": (
"audioflamingo3",
"AudioFlamingo3ForConditionalGeneration",
),
"AyaVisionForConditionalGeneration": (
"aya_vision",
"AyaVisionForConditionalGeneration",

View File

@ -409,10 +409,11 @@ class CudaPlatformBase(Platform):
)
selected_index = sorted_indices[0]
selected_backend = valid_backends_priorities[selected_index][0]
logger.info(
logger.info_once(
"Using %s attention backend out of potential backends: %s",
selected_backend.name,
[b[0].name for b in valid_backends_priorities],
tuple(b[0].name for b in valid_backends_priorities),
scope="local",
)
return selected_backend.get_path()

View File

@ -61,7 +61,7 @@ class WorkerProfiler(ABC):
"""Call _stop with error handling but no safeguards."""
try:
self._stop()
logger.info("Profiler stopped successfully.")
logger.info_once("Profiler stopped successfully.", scope="local")
except Exception as e:
logger.warning("Failed to stop profiler: %s", e)
self._running = False # Always mark as not running, assume stop worked
@ -91,7 +91,7 @@ class WorkerProfiler(ABC):
and self._delay_iters > 0
and self._active_iteration_count == self._delay_iters
):
logger.info("Starting profiler after delay...")
logger.info_once("Starting profiler after delay...", scope="local")
self._call_start()
if self._running:
@ -105,7 +105,9 @@ class WorkerProfiler(ABC):
# Automatically stop the profiler after max iters
# will be marked as not running, but leave as active so that stop
# can clean up properly
logger.info("Max profiling iterations reached. Stopping profiler...")
logger.info_once(
"Max profiling iterations reached. Stopping profiler...", scope="local"
)
self._call_stop()
return
@ -125,7 +127,7 @@ class WorkerProfiler(ABC):
def shutdown(self) -> None:
"""Ensure profiler is stopped when shutting down."""
logger.info_once("Shutting down profiler")
logger.info_once("Shutting down profiler", scope="local")
if self._running:
self.stop()
@ -156,9 +158,10 @@ class TorchProfilerWrapper(WorkerProfiler):
self.profiler_config = profiler_config
torch_profiler_trace_dir = profiler_config.torch_profiler_dir
if local_rank in (None, 0):
logger.info(
logger.info_once(
"Torch profiling enabled. Traces will be saved to: %s",
torch_profiler_trace_dir,
scope="local",
)
logger.debug(
"Profiler config: record_shapes=%s,"

View File

@ -211,7 +211,7 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
spec_token_masks = torch.repeat_interleave(
spec_sequence_masks, query_lens
)
index = torch.argsort(spec_token_masks)
index = torch.argsort(spec_token_masks, stable=True)
num_non_spec_tokens = num_prefill_tokens + num_decode_tokens
non_spec_token_indx = index[:num_non_spec_tokens]
spec_token_indx = index[num_non_spec_tokens:]

View File

@ -706,7 +706,7 @@ class WorkerProc:
death_pipe.recv()
except EOFError:
# Parent process has exited, terminate this worker
logger.info("Parent process exited, terminating worker")
logger.info_once("Parent process exited, terminating worker")
# Send signal to self to trigger clean shutdown
shutdown_event.set()
except Exception as e:

View File

@ -268,13 +268,7 @@ def has_xgrammar_unsupported_json_features(schema: dict[str, Any]) -> bool:
# Unsupported keywords for objects
if obj.get("type") == "object" and any(
key in obj
for key in (
"minProperties",
"maxProperties",
"propertyNames",
"patternProperties",
)
key in obj for key in ("patternProperties", "propertyNames")
):
return True