Merge branch 'main' into mlm-full-lora-support

This commit is contained in:
Jee Jee Li 2025-12-04 22:25:42 +08:00 committed by GitHub
commit c94cdf1c50
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
190 changed files with 7128 additions and 3855 deletions

View File

@ -1,46 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Generate an index.html linking the x86_64 and aarch64 vLLM wheels.

Given one wheel path via --wheel, the sibling wheel name for the other
architecture is derived by swapping the platform and manylinux tags, and
both filenames are written as links into index.html.
"""
import argparse
import os

# BUG FIX: closing tag was malformed ("</h1/>"), which is invalid HTML.
template = """<!DOCTYPE html>
<html>
<body>
<h1>Links for vLLM</h1>
<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
</body>
</html>
"""

parser = argparse.ArgumentParser()
parser.add_argument("--wheel", help="The wheel path.", required=True)
args = parser.parse_args()

filename = os.path.basename(args.wheel)

with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
    if "x86_64" in filename:
        x86_wheel = filename
        arm_wheel = filename.replace("x86_64", "aarch64").replace(
            "manylinux1", "manylinux2014"
        )
    elif "aarch64" in filename:
        x86_wheel = filename.replace("aarch64", "x86_64").replace(
            "manylinux2014", "manylinux1"
        )
        arm_wheel = filename
    else:
        # BUG FIX: the f-string previously interpolated nothing; include the
        # offending filename so the failure is actionable.
        raise ValueError(f"Unsupported wheel: {filename}")
    # cloudfront requires escaping the '+' character
    f.write(
        template.format(
            x86_wheel=x86_wheel,
            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
            arm_wheel=arm_wheel,
            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
        )
    )

View File

@ -7,13 +7,14 @@
import argparse
import json
import re
import sys
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
from urllib.parse import quote
import regex as re
if not sys.version_info >= (3, 12):
raise RuntimeError("This script requires Python 3.12 or higher.")

View File

@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION="ascend910b1"
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \

View File

@ -0,0 +1,73 @@
#!/usr/bin/env bash
# Scheduled accuracy check: serve DeepSeek-V2-Lite with expert parallelism and
# asynchronous EPLB under each all2all backend, run the GSM8K eval, and fail
# the run if accuracy drops below THRESHOLD.
set -euxo pipefail
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
# Block until the server's /health endpoint responds (capped at 10 minutes).
wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}
MODEL="deepseek-ai/DeepSeek-V2-lite"
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
# Stop the serve process: try a graceful kill, wait up to ~10s, then SIGKILL.
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
# Make sure the server dies even when the eval or the assertion fails.
trap cleanup EXIT
# One serve + eval cycle per backend; each cycle gets a fresh port.
for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 2 \
--data-parallel-size 2 \
--enable-expert-parallel \
--enable-eplb \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--trust-remote-code \
--max-model-len 2048 \
--port $PORT &
SERVER_PID=$!
wait_for_server $PORT
# Sanitize the model name into a filesystem-friendly tag.
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
# Parse the saved results and assert accuracy clears the threshold.
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY
cleanup
SERVER_PID=
sleep 1
PORT=$((PORT+1))
done

View File

@ -50,6 +50,7 @@ for BACK in "${BACKENDS[@]}"; do
--data-parallel-size 2 \
--enable-expert-parallel \
--enable-eplb \
--eplb-config '{"window_size":200,"step_interval":600}' \
--trust-remote-code \
--max-model-len 2048 \
--port $PORT &

View File

@ -0,0 +1,74 @@
#!/usr/bin/env bash
# Scheduled accuracy check: serve Qwen3-Next-80B-A3B-Instruct with MTP
# speculative decoding, expert parallelism, and asynchronous EPLB under each
# all2all backend; run the GSM8K eval and fail if accuracy drops below
# THRESHOLD.
set -euxo pipefail
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8040}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
# Block until the server's /health endpoint responds (capped at 10 minutes).
wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
# Stop the serve process: try a graceful kill, wait up to ~10s, then SIGKILL.
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
# Make sure the server dies even when the eval or the assertion fails.
trap cleanup EXIT
# One serve + eval cycle per backend; each cycle gets a fresh port.
for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 4 \
--enable-expert-parallel \
--enable-eplb \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \
--max-model-len 2048 \
--gpu-memory-utilization 0.9 \
--port $PORT &
SERVER_PID=$!
wait_for_server $PORT
# Sanitize the model name into a filesystem-friendly tag.
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
# Parse the saved results and assert accuracy clears the threshold.
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY
cleanup
SERVER_PID=
sleep 1
PORT=$((PORT+1))
done

View File

@ -81,7 +81,7 @@ else
alias_arg=""
fi
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
$PYTHON pip install regex && .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
# copy indices to /<commit>/ unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX"

View File

@ -987,7 +987,8 @@ steps:
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models Test (Extended) 1
- label: Multi-Modal Models Test (Extended) 1 # 60min
timeout_in_minutes: 120
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@ -1011,7 +1012,8 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
- label: Multi-Modal Models Test (Extended) 3
- label: Multi-Modal Models Test (Extended) 3 # 75min
timeout_in_minutes: 150
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking

View File

@ -387,6 +387,7 @@ steps:
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
- vllm/multimodal
- examples/
commands:
- pip install tensorizer # for tensorizer test
@ -1373,4 +1374,22 @@ steps:
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040

View File

@ -137,6 +137,7 @@ Compute Resources:
- Alibaba Cloud
- AMD
- Anyscale
- Arm
- AWS
- Crusoe Cloud
- Databricks

View File

@ -0,0 +1,120 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
This focuses on a single test payload shaped like the prefix-cache hash input:
(32-byte bytes object, 32-int tuple)
Usage:
python benchmarks/hash_micro_benchmark.py --iterations 20000
"""
from __future__ import annotations
import argparse
import random
import statistics
import time
from collections.abc import Callable, Iterable
from vllm.utils.hashing import sha256, xxhash
def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
"""Generate a deterministic test payload."""
random.seed(seed)
bytes_data = bytes(random.getrandbits(8) for _ in range(32))
int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
return (bytes_data, int_tuple)
def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
"""Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
times: list[float] = []
# Warm-up to avoid first-run noise.
for _ in range(200):
func(data)
for _ in range(iterations):
start = time.perf_counter()
func(data)
end = time.perf_counter()
times.append(end - start)
avg = statistics.mean(times)
std = statistics.stdev(times) if len(times) > 1 else 0.0
return avg, std
def _run_benchmarks(
benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
data: tuple,
iterations: int,
):
"""Yield (name, avg, std) for each benchmark, skipping unavailable ones."""
for name, func in benchmarks:
try:
avg, std = _benchmark_func(func, data, iterations)
except ModuleNotFoundError as exc:
print(f"Skipping {name}: {exc}")
continue
yield name, avg, std
def builtin_hash(data: tuple) -> int:
    """Hash `data` with Python's built-in hash()."""
    digest = hash(data)
    return digest
def main() -> None:
    """Parse CLI args, time every hash candidate, and print a summary."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--iterations",
        type=int,
        default=10_000,
        help="Number of measured iterations per hash function.",
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for test payload."
    )
    args = parser.parse_args()

    payload = _generate_test_data(args.seed)
    candidates = (
        ("SHA256 (pickle)", sha256),
        ("xxHash (pickle)", xxhash),
        ("built-in hash()", builtin_hash),
    )

    rule = "=" * 60
    print(rule)
    print("HASH FUNCTION MICRO BENCHMARK")
    print(rule)
    print("Test data: (32-byte bytes object, 32-int tuple)")
    print(f"Iterations: {args.iterations:,}")
    print(rule)

    results = list(_run_benchmarks(candidates, payload, args.iterations))

    print("\nResults:")
    for name, avg, std in results:
        print(f" {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")

    # Use the built-in hash() timing as the baseline for relative numbers.
    baseline = next((r for r in results if r[0] == "built-in hash()"), None)
    if baseline is None:
        print("\nBuilt-in hash() result missing; cannot compute speed ratios.")
        return
    _, builtin_avg, _ = baseline
    print("\n" + rule)
    print("SUMMARY (relative to built-in hash())")
    print(rule)
    for name, avg, _ in results:
        if name != "built-in hash()":
            print(f"{name} is {avg / builtin_avg:.1f}x slower than built-in hash()")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,110 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Simple benchmark to compare prefix-cache block hashing algorithms.
Example:
python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32
"""
from __future__ import annotations
import argparse
import random
import statistics
import sys
import time
from collections.abc import Callable, Iterable, Sequence
from vllm.utils.hashing import get_hash_fn_by_name
from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash
SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor")
def _generate_blocks(
num_blocks: int, block_size: int, vocab_size: int, seed: int
) -> list[list[int]]:
rng = random.Random(seed)
return [
[rng.randrange(vocab_size) for _ in range(block_size)]
for _ in range(num_blocks)
]
def _hash_all_blocks(
    hash_fn: Callable[[object], bytes],
    blocks: Iterable[Sequence[int]],
) -> float:
    """Hash every block once, chained, and return the elapsed seconds."""
    # Each hash chains off the previous block's hash, mirroring how the
    # prefix cache links block hashes together.
    prev: BlockHash | None = None
    begin = time.perf_counter()
    for token_block in blocks:
        prev = hash_block_tokens(hash_fn, prev, token_block, extra_keys=None)
    return time.perf_counter() - begin
def _benchmark(
    hash_algo: str,
    blocks: list[list[int]],
    trials: int,
) -> tuple[float, float, float] | None:
    """Time `trials` full hashing passes for one algorithm.

    Returns (avg_seconds, best_seconds, tokens_per_second), or None when the
    algorithm's optional dependency is not installed.
    """
    try:
        hash_fn = get_hash_fn_by_name(hash_algo)
        init_none_hash(hash_fn)
        timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)]
    except ModuleNotFoundError as exc:
        # e.g. xxhash/cbor2 not installed for this algorithm variant.
        print(f"Skipping {hash_algo}: {exc}", file=sys.stderr)
        return None
    avg = statistics.mean(timings)
    best = min(timings)
    # throughput: tokens / second, based on the fastest trial
    total_tokens = len(blocks) * len(blocks[0])
    return avg, best, total_tokens / best
def main() -> None:
    """Parse CLI options and benchmark each requested hash algorithm."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--num-blocks", type=int, default=10000, help="Block count.")
    parser.add_argument("--block-size", type=int, default=32, help="Tokens per block.")
    parser.add_argument(
        "--vocab-size", type=int, default=32000, help="Token id range [0, vocab_size)."
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed.")
    parser.add_argument(
        "--trials", type=int, default=5, help="Number of timed trials per algorithm."
    )
    parser.add_argument(
        "--algorithms",
        nargs="+",
        default=SUPPORTED_ALGOS,
        choices=SUPPORTED_ALGOS,
        help="Hash algorithms to benchmark.",
    )
    args = parser.parse_args()

    # One shared token set so every algorithm hashes identical data.
    blocks = _generate_blocks(
        args.num_blocks, args.block_size, args.vocab_size, args.seed
    )
    print(
        f"Benchmarking {len(args.algorithms)} algorithms on "
        f"{args.num_blocks} blocks (block size={args.block_size})."
    )
    for algo in args.algorithms:
        stats = _benchmark(algo, blocks, args.trials)
        if stats is None:
            # Missing optional dependency; already reported on stderr.
            continue
        avg, best, throughput = stats
        print(
            f"{algo:14s} avg: {avg:.6f}s best: {best:.6f}s "
            f"throughput: {throughput / 1e6:.2f}M tokens/s"
        )


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,244 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from enum import Enum
from itertools import product
from typing import Any
import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_per_token_group_quant_fp8_colmajor,
silu_mul_per_token_group_quant_fp8_colmajor,
)
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
from .utils import ArgPool, Bench, CudaGraphBenchParams
GROUP_SIZE = 128
FLOAT8_T = torch.float8_e4m3fn
def print_timers(timers: list[TMeasurement], cuda_graph_nops: int):
    """Print a normalization note plus the comparison table for `timers`.

    Each reported timing covers `cuda_graph_nops` consecutive invocations
    (the CUDA-graph replay batches that many calls), hence the note.

    NOTE(review): the note says "timings reported above" but is printed
    before compare.print() emits the table — confirm the intended ordering.
    """
    print(
        f"Note : The timings reported above is for {cuda_graph_nops} "
        "consecutive invocations of the benchmarking functions. "
        f"Please divide by {cuda_graph_nops} for single invocation "
        "timings."
    )
    compare = TBenchmark.Compare(timers)
    compare.print()
class ImplType(Enum):
    """Selects which silu-mul + group-quant implementation to benchmark."""

    SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1
    REFERENCE = 2

    def get_impl(self):
        """Return the callable implementation for this member."""
        if self is ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
            return silu_mul_per_token_group_quant_fp8_colmajor
        if self is ImplType.REFERENCE:
            return reference
        raise ValueError(f"Unrecognized ImplType {self}")
@dataclass
class BenchmarkTensors:
    """Pre-allocated CUDA tensors for one (T, N) benchmark configuration."""

    # Activation input, shape (T, N), bfloat16 on CUDA.
    input: torch.Tensor
    # Fused-kernel quantized output, shape (T, N // 2), fp8.
    output: torch.Tensor
    # Reference act output tensor
    ref_act_out: torch.Tensor
    # Reference-path quantized output, shape (T, N // 2), fp8.
    ref_quant_out: torch.Tensor

    @staticmethod
    def make(T: int, N: int) -> "BenchmarkTensors":
        """Allocate tensors for T tokens and width N (both group-aligned)."""
        assert T % GROUP_SIZE == 0
        # N must hold two GROUP_SIZE-aligned halves (gate and up projections).
        assert N % (GROUP_SIZE * 2) == 0
        input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
        # silu_mul_per_token_group_quant_fp8_colmajor output.
        output = torch.rand((T, N // 2), dtype=torch.bfloat16, device="cuda").to(
            FLOAT8_T
        )
        # reference output.
        ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device="cuda")
        ref_quant_out = torch.empty(
            (T, N // 2), dtype=torch.bfloat16, device="cuda"
        ).to(FLOAT8_T)
        return BenchmarkTensors(
            input=input,
            output=output,
            ref_act_out=ref_act_out,
            ref_quant_out=ref_quant_out,
        )

    @property
    def T(self):
        # Number of tokens (rows of the input).
        return self.input.size(0)

    @property
    def N(self):
        # Input width before the silu-mul halving.
        return self.input.size(1)

    def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]:
        """Build the keyword arguments expected by the given implementation."""
        if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
            return {
                "input": self.input,
                "output": self.output,
                "use_ue8m0": is_deep_gemm_e8m0_used(),
            }
        elif impl_type == ImplType.REFERENCE:
            return {
                "input": self.input,
                "act_out": self.ref_act_out,
                "quant_out": self.ref_quant_out,
                "use_ue8m0": is_deep_gemm_e8m0_used(),
            }
        raise ValueError(f"Unrecognized impl_type {impl_type}")
def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool):
    """
    Reference triton quant kernel from,
    vllm.model_executor.layers.quantization.utils.fp8_utils

    Quantizes `x` per token group of GROUP_SIZE elements into `quant_out`
    and returns (quantized tensor, per-group scales).
    """
    assert quant_out.size() == x.size()
    # Allocate the scale tensor column-major format.
    shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
    x_q = quant_out
    # permute(-1, -2) gives a column-major view of the scales.
    x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
    # One triton program per group of GROUP_SIZE elements.
    M = x.numel() // GROUP_SIZE
    N = GROUP_SIZE
    BLOCK = triton.next_power_of_2(N)
    # heuristics for number of warps
    num_warps = min(max(BLOCK // 256, 1), 8)
    num_stages = 1
    # fp8 representable range used for clamping inside the kernel.
    finfo = torch.finfo(FLOAT8_T)
    fp8_min = finfo.min
    fp8_max = finfo.max
    _per_token_group_quant_fp8_colmajor[(M,)](
        x,
        x_q,
        x_s,
        GROUP_SIZE,
        x.shape[1],
        x.stride(0),
        x_s.stride(1),
        eps=1e-10,
        fp8_min=fp8_min,
        fp8_max=fp8_max,
        use_ue8m0=use_ue8m0,
        BLOCK=BLOCK,
        num_warps=num_warps,
        num_stages=num_stages,
    )
    return x_q, x_s
def reference(
    input: torch.Tensor,
    act_out: torch.Tensor,
    quant_out: torch.Tensor,
    use_ue8m0: bool,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Two-step reference: silu_and_mul into `act_out`, then group quant.

    Returns (quantized tensor, per-group scales) from reference_quant.
    """
    torch.ops._C.silu_and_mul(act_out, input)
    return reference_quant(act_out, quant_out, use_ue8m0)
def bench_impl(
    bench_tensors: list[BenchmarkTensors], impl_type: ImplType
) -> TMeasurement:
    """Benchmark one implementation over a pool of pre-made tensor sets.

    All entries of `bench_tensors` share the same (T, N); the pool lets the
    CUDA-graph benchmark rotate argument sets between replays.
    """
    T = bench_tensors[0].T
    N = bench_tensors[0].N
    arg_pool_size = len(bench_tensors)
    kwargs_list = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors]
    # warmup
    for kwargs in kwargs_list:
        impl_type.get_impl()(**kwargs)
    torch.cuda.synchronize()
    # Merge into a single kwargs and qualify arguments as ArgPool
    kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
    for _kwargs in kwargs_list:
        for k, v in _kwargs.items():
            kwargs[k].values.append(v)
    cuda_graph_params = None
    cuda_graph_params = CudaGraphBenchParams(arg_pool_size)
    timer = None
    # Bench captures the call in a CUDA graph and times replays.
    with Bench(
        cuda_graph_params,
        "silu-mul-quant",
        f"num_tokens={T}, N={N}",
        impl_type.name,
        impl_type.get_impl(),
        **kwargs,
    ) as bench:
        timer = bench.run()
    return timer
def test_correctness(T: int, N: int):
    """Assert the fused kernel matches the reference for one (T, N) shape."""
    print(f"Testing num_tokens={T}, N={N} ...")
    bench_tensor = BenchmarkTensors.make(T, N)

    def output_from_impl(impl: ImplType) -> tuple[torch.Tensor, torch.Tensor]:
        # Returns (quantized output, per-group scales) for the chosen impl.
        return impl.get_impl()(**bench_tensor.make_impl_kwargs(impl))

    # reference output
    ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
    # test output
    out_q, out_s = output_from_impl(
        ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
    )
    # Upcast the fp8 quantized outputs to float32 before comparing.
    torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32))
    torch.testing.assert_close(ref_out_s, out_s)
def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]:
    """Benchmark fused vs. reference impls over the (T, N) grid.

    Each shape is correctness-checked first; per-shape timings are printed
    as the sweep progresses, and the cumulative table is printed at the end.
    """
    timers = []
    for N, T in product(Ns, Ts):
        test_correctness(T, N)
        bench_tensors: list[BenchmarkTensors] = [
            BenchmarkTensors.make(T, N) for _ in range(arg_pool_size)
        ]
        silu_mul_quant_timer = bench_impl(
            bench_tensors, ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
        )
        timers.append(silu_mul_quant_timer)
        reference_timer = bench_impl(bench_tensors, ImplType.REFERENCE)
        timers.append(reference_timer)
        # Per-shape side-by-side comparison for quick feedback.
        print_timers(
            [silu_mul_quant_timer, reference_timer], cuda_graph_nops=arg_pool_size
        )
    # Full comparison across every shape benchmarked above.
    print_timers(timers, cuda_graph_nops=arg_pool_size)
    return timers
if __name__ == "__main__":
    # Token-count sweep: fine steps up to 1920, then coarse steps to 131072,
    # crossed with three input widths.
    T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)]
    N = [2048, 4096, 8192]
    print(f"T = {T}, N = {N}")
    run(T, N, arg_pool_size=8)

View File

@ -150,6 +150,97 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################
#################### CSRC BUILD IMAGE ####################
FROM base AS csrc-build
ARG TARGETPLATFORM
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
# install build dependencies
COPY requirements/build.txt requirements/build.txt
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
WORKDIR /workspace
COPY pyproject.toml setup.py CMakeLists.txt ./
COPY cmake cmake/
COPY csrc csrc/
COPY vllm/envs.py vllm/envs.py
COPY vllm/__init__.py vllm/__init__.py
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# Flag to control whether to use pre-built vLLM wheels
ARG VLLM_USE_PRECOMPILED=""
ARG VLLM_MERGE_BASE_COMMIT=""
ARG VLLM_MAIN_CUDA_VERSION=""
# Use dummy version for csrc-build wheel (only .so files are extracted, version doesn't matter)
ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
&& export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
&& export VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_MERGE_BASE_COMMIT}" \
&& export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \
fi
ARG vllm_target_device="cuda"
ENV VLLM_TARGET_DEVICE=${vllm_target_device}
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
if [ "$USE_SCCACHE" != "1" ]; then \
# Clean any existing CMake artifacts
rm -rf .deps && \
mkdir -p .deps && \
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
export VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_MERGE_BASE_COMMIT}" && \
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
#################### CSRC BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG TARGETPLATFORM
@ -172,66 +263,28 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
WORKDIR /workspace
COPY --from=csrc-build /workspace/dist /precompiled-wheels
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# Flag to control whether to use pre-built vLLM wheels
ARG VLLM_USE_PRECOMPILED=""
ARG VLLM_MAIN_CUDA_VERSION=""
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
&& export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
&& export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \
fi
ARG vllm_target_device="cuda"
ENV VLLM_TARGET_DEVICE=${vllm_target_device}
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" != "1" ]; then \
# Clean any existing CMake artifacts
rm -rf .deps && \
mkdir -p .deps && \
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
# Skip adding +precompiled suffix to version (preserves git-derived version)
ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "${vllm_target_device}" = "cuda" ]; then \
export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl); \
fi && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
# Install DeepGEMM from source
ARG DEEPGEMM_GIT_REF
@ -527,7 +580,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
else \
BITSANDBYTES_VERSION="0.46.1"; \
fi; \
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.0'
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3'
ENV VLLM_USAGE_SOURCE production-docker-image

Binary file not shown.

Before

Width:  |  Height:  |  Size: 146 KiB

After

Width:  |  Height:  |  Size: 174 KiB

View File

@ -670,6 +670,35 @@ vllm bench serve \
</details>
### 🧪 Hashing Benchmarks
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
Two helper scripts live in `benchmarks/` to compare hashing options used by prefix caching and related utilities. They are standalone (no server required) and help choose a hash algorithm before enabling prefix caching in production.
- `benchmarks/benchmark_hash.py`: Micro-benchmark that measures per-call latency of three implementations on a representative `(bytes, tuple[int])` payload.
```bash
python benchmarks/benchmark_hash.py --iterations 20000 --seed 42
```
- `benchmarks/benchmark_prefix_block_hash.py`: End-to-end block hashing benchmark that runs the full prefix-cache hash pipeline (`hash_block_tokens`) across many fake blocks and reports throughput.
```bash
python benchmarks/benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32 --trials 5
```
Supported algorithms: `sha256`, `sha256_cbor`, `xxhash`, `xxhash_cbor`. Install optional deps to exercise all variants:
```bash
uv pip install xxhash cbor2
```
If an algorithm's dependency is missing, the script will skip it and continue.
</details>
### ⚡ Request Prioritization Benchmark
<details class="admonition abstract" markdown="1">

View File

@ -18,6 +18,7 @@ Compute Resources:
- Alibaba Cloud
- AMD
- Anyscale
- Arm
- AWS
- Crusoe Cloud
- Databricks

View File

@ -57,15 +57,15 @@ vLLM also provides [a reference example](../../examples/online_serving/prometheu
The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:
- `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds.
- `vllm:prompt_tokens_total` - Prompt tokens.
- `vllm:generation_tokens_total` - Generation tokens.
- `vllm:prompt_tokens` - Prompt tokens.
- `vllm:generation_tokens` - Generation tokens.
- `vllm:time_per_output_token_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds.
- `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds.
- `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in the RUNNING, WAITING, and SWAPPED states.
- `vllm:gpu_cache_usage_perc` - Percentage of used cache blocks by vLLM.
- `vllm:request_prompt_tokens` - Request prompt length.
- `vllm:request_generation_tokens` - Request generation length.
- `vllm:request_success_total` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.
- `vllm:request_success` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.
- `vllm:request_queue_time_seconds` - Queue time.
- `vllm:request_prefill_time_seconds` - Requests prefill time.
- `vllm:request_decode_time_seconds` - Requests decode time.
@ -571,9 +571,9 @@ model and then validate those tokens with the larger model.
- `vllm:spec_decode_draft_acceptance_rate` (Gauge)
- `vllm:spec_decode_efficiency` (Gauge)
- `vllm:spec_decode_num_accepted_tokens_total` (Counter)
- `vllm:spec_decode_num_draft_tokens_total` (Counter)
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)
- `vllm:spec_decode_num_accepted_tokens` (Counter)
- `vllm:spec_decode_num_draft_tokens` (Counter)
- `vllm:spec_decode_num_emitted_tokens` (Counter)
There is a PR under review (<https://github.com/vllm-project/vllm/pull/12193>) to add "prompt lookup (ngram)"
speculative decoding to v1. Other techniques will follow. We should

View File

@ -90,7 +90,6 @@ To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels
| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],</br>[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
| marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
@ -114,5 +113,5 @@ The following table shows "families" of modular kernels that are intended to wor
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|---------|-----------------------------------------|----------------------------------------------|
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |

View File

@ -54,7 +54,7 @@ th:not(:first-child) {
| beam-search | ✅ | ✅ | ✅ | [](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](https://github.com/vllm-project/vllm/issues/7968) | ❔ | ✅ | ✅ | |
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
\* Chunked prefill and prefix caching are only applicable to last-token or all pooling with causal attention.
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
### Feature x Hardware

View File

@ -0,0 +1,58 @@
# MooncakeConnector Usage Guide
## About Mooncake
Mooncake aims to enhance the inference efficiency of large language models (LLMs), especially in slow object storage environments, by constructing a multi-level caching pool on high-speed interconnected DRAM/SSD resources. Compared to traditional caching systems, Mooncake utilizes (GPUDirect) RDMA technology to transfer data directly in a zero-copy manner, while maximizing the use of multi-NIC resources on a single machine.
For more details about Mooncake, please refer to [Mooncake project](https://github.com/kvcache-ai/Mooncake) and [Mooncake documents](https://kvcache-ai.github.io/Mooncake/).
## Prerequisites
### Installation
Install mooncake through pip: `uv pip install mooncake-transfer-engine`.
Refer to the [Mooncake official repository](https://github.com/kvcache-ai/Mooncake) for more installation instructions.
## Usage
### Prefiller Node (192.168.0.2)
```bash
vllm serve Qwen/Qwen2.5-7B-Instruct --port 8010 --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_producer"}'
```
### Decoder Node (192.168.0.3)
```bash
vllm serve Qwen/Qwen2.5-7B-Instruct --port 8020 --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_consumer"}'
```
### Proxy
```bash
python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --prefiller-host 192.168.0.2 --prefiller-port 8010 --decoder-host 192.168.0.3 --decoder-port 8020
```
> NOTE: The Mooncake Connector currently uses the proxy from nixl_integration. This will be replaced with a self-developed proxy in the future.
Now you can send requests to the proxy server through port 8000.
## Environment Variables
- `VLLM_MOONCAKE_BOOTSTRAP_PORT`: Port for Mooncake bootstrap server
- Default: 8998
- Required only for prefiller instances
- Each vLLM worker needs a unique port on its host; using the same port number across different hosts is fine
- For TP/DP deployments, each worker's port on a node is computed as: base_port + dp_rank * tp_size + tp_rank
- Used for the decoder notifying the prefiller
- `VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller's KV cache for a particular request. (Optional)
- Default: 480
- If a request is aborted and the decoder has not yet notified the prefiller, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.
## KV Role Options
- **kv_producer**: For prefiller instances that generate KV caches
- **kv_consumer**: For decoder instances that consume KV caches from prefiller
- **kv_both**: Enables symmetric functionality where the connector can act as both producer and consumer. This provides flexibility for experimental setups and scenarios where the role distinction is not predetermined.

View File

@ -795,14 +795,12 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
??? code
```python
from vllm.utils.serial_utils import tensor2base64
image_embedding = torch.load(...)
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
buffer = io.BytesIO()
torch.save(image_embedding, buffer)
buffer.seek(0)
binary_data = buffer.read()
base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
base64_image_embedding = tensor2base64(image_embedding)
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")

View File

@ -4,9 +4,6 @@ vLLM has experimental support for macOS with Apple Silicon. For now, users must
Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
!!! warning
There are no pre-built wheels or images for this device, so you must build vLLM from source.
# --8<-- [end:installation]
# --8<-- [start:requirements]
@ -20,6 +17,8 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
# --8<-- [end:set-up-using-python]
# --8<-- [start:pre-built-wheels]
Currently, there are no pre-built Apple silicon CPU wheels.
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
@ -78,6 +77,8 @@ uv pip install -e .
# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]
Currently, there are no pre-built Arm silicon CPU images.
# --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source]

View File

@ -1,11 +1,6 @@
# --8<-- [start:installation]
vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform.
ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
!!! warning
There are no pre-built wheels or images for this device, so you must build vLLM from source.
vLLM offers basic model inferencing and serving on the Arm CPU platform, with NEON support and the FP32, FP16 and BF16 data types.
# --8<-- [end:installation]
# --8<-- [start:requirements]
@ -20,6 +15,23 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
# --8<-- [end:set-up-using-python]
# --8<-- [start:pre-built-wheels]
Pre-built vLLM wheels for Arm are available since version 0.11.2. These wheels contain pre-compiled C++ binaries.
Please replace `<version>` in the commands below with a specific version string (e.g., `0.11.2`).
```bash
uv pip install --pre vllm==<version>+cpu --extra-index-url https://wheels.vllm.ai/<version>%2Bcpu/
```
??? console "pip"
```bash
pip install --pre vllm==<version>+cpu --extra-index-url https://wheels.vllm.ai/<version>%2Bcpu/
```
The `uv` approach works for vLLM `v0.6.6` and later. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
!!! note
Nightly wheels are currently unsupported for this architecture (e.g., for bisecting a behavior change or a performance regression).
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
@ -69,6 +81,8 @@ Testing has been conducted on AWS Graviton3 instances for compatibility.
# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]
Currently, there are no pre-built Arm CPU images.
# --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source]
```bash

View File

@ -46,11 +46,25 @@ vLLM is a Python library that supports the following CPU variants. Select your C
### Pre-built wheels
Please refer to the instructions for [pre-built wheels on GPU](./gpu.md#pre-built-wheels).
When specifying the index URL, please make sure to use the `cpu` variant subdirectory.
For example, the nightly build index is: `https://wheels.vllm.ai/nightly/cpu/`.
=== "Intel/AMD x86"
--8<-- "docs/getting_started/installation/cpu.x86.inc.md:pre-built-wheels"
=== "ARM AArch64"
--8<-- "docs/getting_started/installation/cpu.arm.inc.md:pre-built-wheels"
=== "Apple silicon"
--8<-- "docs/getting_started/installation/cpu.apple.inc.md:pre-built-wheels"
=== "IBM Z (S390X)"
--8<-- "docs/getting_started/installation/cpu.s390x.inc.md:pre-built-wheels"
### Build wheel from source
#### Set up using Python-only build (without compilation) {#python-only-build}
@ -87,6 +101,18 @@ VLLM_USE_PRECOMPILED=1 VLLM_PRECOMPILED_WHEEL_VARIANT=cpu VLLM_TARGET_DEVICE=cpu
--8<-- "docs/getting_started/installation/cpu.x86.inc.md:pre-built-images"
=== "ARM AArch64"
--8<-- "docs/getting_started/installation/cpu.arm.inc.md:pre-built-images"
=== "Apple silicon"
--8<-- "docs/getting_started/installation/cpu.apple.inc.md:pre-built-images"
=== "IBM Z (S390X)"
--8<-- "docs/getting_started/installation/cpu.s390x.inc.md:pre-built-images"
### Build image from source
=== "Intel/AMD x86"

View File

@ -4,9 +4,6 @@ vLLM has experimental support for s390x architecture on IBM Z platform. For now,
Currently, the CPU implementation for s390x architecture supports FP32 datatype only.
!!! warning
There are no pre-built wheels or images for this device, so you must build vLLM from source.
# --8<-- [end:installation]
# --8<-- [start:requirements]
@ -21,6 +18,8 @@ Currently, the CPU implementation for s390x architecture supports FP32 datatype
# --8<-- [end:set-up-using-python]
# --8<-- [start:pre-built-wheels]
Currently, there are no pre-built IBM Z CPU wheels.
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
@ -69,6 +68,8 @@ Execute the following commands to build and install vLLM from source.
# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]
Currently, there are no pre-built IBM Z CPU images.
# --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source]

View File

@ -17,6 +17,8 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
# --8<-- [end:set-up-using-python]
# --8<-- [start:pre-built-wheels]
Currently, there are no pre-built x86 CPU wheels.
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]

View File

@ -5,9 +5,6 @@ vLLM supports AMD GPUs with ROCm 6.3 or above, and torch 2.8.0 and above.
!!! tip
[Docker](#set-up-using-docker) is the recommended way to use vLLM on ROCm.
!!! warning
There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
# --8<-- [end:installation]
# --8<-- [start:requirements]

View File

@ -2,9 +2,6 @@
vLLM initially supports basic model inference and serving on Intel GPU platform.
!!! warning
There are no pre-built wheels for this device, so you need to build vLLM from source. Alternatively, you can use pre-built images which are based on vLLM released versions.
# --8<-- [end:installation]
# --8<-- [start:requirements]

View File

@ -711,7 +711,6 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ |
| `PixtralForConditionalGeneration` | Ministral 3 (Mistral format), Mistral 3 (Mistral format), Mistral Large 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ |
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ |
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ |

View File

@ -23,31 +23,23 @@ def create_test_prompts(
# this is an example of using quantization without LoRA
(
"My name is",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
None,
),
# the next three examples use quantization with LoRA
(
"my name is",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
LoRARequest("lora-test-1", 1, lora_path),
),
(
"The capital of USA is",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
LoRARequest("lora-test-2", 1, lora_path),
),
(
"The capital of France is",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
LoRARequest("lora-test-3", 1, lora_path),
),
]

View File

@ -27,9 +27,7 @@ def create_test_prompts(
return [
(
"A robot may not injure a human being",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
None,
),
(
@ -41,22 +39,12 @@ def create_test_prompts(
),
(
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
SamplingParams(
temperature=0.0,
logprobs=1,
prompt_logprobs=1,
max_tokens=128,
),
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
LoRARequest("sql-lora", 1, lora_path),
),
(
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
SamplingParams(
temperature=0.0,
logprobs=1,
prompt_logprobs=1,
max_tokens=128,
),
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
LoRARequest("sql-lora2", 2, lora_path),
),
]

View File

@ -309,6 +309,28 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
)
# HunyuanOCR
def load_hunyuan_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a ModelRequestData for tencent/HunyuanOCR over the given images."""
    num_images = len(image_urls)
    engine_args = EngineArgs(
        model="tencent/HunyuanOCR",
        max_model_len=8192,
        limit_mm_per_prompt={"image": num_images},
    )
    # One image placeholder triple (begin/content/end tokens) per image.
    image_tokens = (
        "<hy_place▁holder▁no▁100><hy_place▁holder▁no▁102><hy_place▁holder▁no▁101>"  # noqa: E501
        * num_images
    )
    return ModelRequestData(
        engine_args=engine_args,
        prompt=f"<hy_begin▁of▁sentence>{image_tokens}{question}<hy_User>",
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_hyperclovax_seed_vision(
question: str, image_urls: list[str]
) -> ModelRequestData:
@ -1322,6 +1344,7 @@ model_example_map = {
"deepseek_ocr": load_deepseek_ocr,
"gemma3": load_gemma3,
"h2ovl_chat": load_h2ovl,
"hunyuan_vl": load_hunyuan_vl,
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
"idefics3": load_idefics3,
"interns1": load_interns1,

View File

@ -28,13 +28,11 @@ Dependencies:
- openai
"""
import base64
import io
import torch
import transformers
from openai import OpenAI
from vllm.utils.serial_utils import tensor2base64
def main():
client = OpenAI(
@ -58,11 +56,7 @@ def main():
prompt_embeds = embedding_layer(token_ids).squeeze(0)
# Prompt embeddings
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
buffer.seek(0)
binary_data = buffer.read()
encoded_embeds = base64.b64encode(binary_data).decode("utf-8")
encoded_embeds = tensor2base64(prompt_embeds)
completion = client.completions.create(
model=model_name,

View File

@ -150,7 +150,8 @@ def run_siglip(client: OpenAI, model: str):
Start the server using:
vllm serve google/siglip-base-patch16-224 \
--runner pooling
--runner pooling \
--chat-template template_basic.jinja
"""
response = create_chat_embeddings(

View File

@ -46,6 +46,7 @@ scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu
pybase64 # fast base64 implementation
cbor2 # Required for cross-language serialization of hashable objects
ijson # Required for mistral streaming tool parser
setproctitle # Used to set process names for better debugging and monitoring
openai-harmony >= 0.0.3 # Required for gpt-oss
anthropic == 0.71.0

View File

@ -3,7 +3,6 @@ ninja
packaging>=24.2
setuptools>=77.0.3,<81.0.0
setuptools-scm>=8
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"
scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL)

View File

@ -4,7 +4,6 @@
numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding
# Dependencies for CPUs
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"

View File

@ -42,6 +42,6 @@ tritonclient==2.51.0
numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.0
runai-model-streamer[s3,gcs]==0.15.3
fastsafetensors>=0.1.10
pydantic>=2.12 # 2.11 leads to error on python 3.13

View File

@ -12,7 +12,7 @@ tensorizer==2.10.1
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
runai-model-streamer[s3,gcs]==0.15.0
runai-model-streamer[s3,gcs]==0.15.3
conch-triton-kernels==1.2.1
timm>=1.0.17
fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459

View File

@ -51,7 +51,7 @@ tritonclient==2.51.0
arctic-inference == 0.1.1 # Required for suffix decoding test
numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.0
runai-model-streamer[s3,gcs]==0.15.3
fastsafetensors>=0.1.10
pydantic>=2.12 # 2.11 leads to error on python 3.13
decord==0.6.0

View File

@ -965,11 +965,11 @@ rsa==4.9.1
# via google-auth
rtree==1.4.0
# via torchgeo
runai-model-streamer==0.15.0
runai-model-streamer==0.15.3
# via -r requirements/test.in
runai-model-streamer-gcs==0.15.0
runai-model-streamer-gcs==0.15.3
# via runai-model-streamer
runai-model-streamer-s3==0.15.0
runai-model-streamer-s3==0.15.3
# via runai-model-streamer
s3transfer==0.10.3
# via boto3

View File

@ -346,10 +346,13 @@ class precompiled_wheel_utils:
The order of preference is:
1. user-specified wheel location (can be either local or remote, via
VLLM_PRECOMPILED_WHEEL_LOCATION)
2. user-specified variant from nightly repo (current main commit via
VLLM_PRECOMPILED_WHEEL_VARIANT)
2. user-specified variant (VLLM_PRECOMPILED_WHEEL_VARIANT) from nightly repo
3. the variant corresponding to VLLM_MAIN_CUDA_VERSION from nightly repo
4. the default variant from nightly repo (current main commit)
4. the default variant from nightly repo
If downloading from the nightly repo, the commit can be specified via
VLLM_PRECOMPILED_WHEEL_COMMIT; otherwise, the head commit in the main branch
is used.
"""
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
if wheel_location is not None:
@ -362,10 +365,13 @@ class precompiled_wheel_utils:
# try to fetch the wheel metadata from the nightly wheel repo
main_variant = "cu" + envs.VLLM_MAIN_CUDA_VERSION.replace(".", "")
variant = os.getenv("VLLM_PRECOMPILED_WHEEL_VARIANT", main_variant)
commit = os.getenv(
"VLLM_PRECOMPILED_WHEEL_COMMIT",
precompiled_wheel_utils.get_base_commit_in_main_branch(),
)
commit = os.getenv("VLLM_PRECOMPILED_WHEEL_COMMIT", "").lower()
if not commit or len(commit) != 40:
print(
f"VLLM_PRECOMPILED_WHEEL_COMMIT not valid: {commit}"
", trying to fetch base commit in main branch"
)
commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
print(f"Using precompiled wheel commit {commit} with variant {variant}")
try_default = False
wheels, repo_url, download_filename = None, None, None
@ -461,14 +467,22 @@ class precompiled_wheel_utils:
"vllm/cumem_allocator.abi3.so",
]
compiled_regex = re.compile(
flash_attn_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
triton_kernels_regex = re.compile(
r"vllm/third_party/triton_kernels/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
file_members = list(
filter(lambda x: x.filename in files_to_copy, wheel.filelist)
)
file_members += list(
filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
filter(lambda x: flash_attn_regex.match(x.filename), wheel.filelist)
)
file_members += list(
filter(
lambda x: triton_kernels_regex.match(x.filename), wheel.filelist
)
)
for file in file_members:
@ -494,10 +508,6 @@ class precompiled_wheel_utils:
@staticmethod
def get_base_commit_in_main_branch() -> str:
# Force to use the nightly wheel. This is mainly used for CI testing.
if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
return "nightly"
try:
# Get the latest commit hash of the upstream main branch.
resp_json = subprocess.check_output(
@ -508,6 +518,7 @@ class precompiled_wheel_utils:
]
).decode("utf-8")
upstream_main_commit = json.loads(resp_json)["sha"]
print(f"Upstream main branch latest commit: {upstream_main_commit}")
# In Docker build context, .git may be immutable or missing.
if envs.VLLM_DOCKER_BUILD_CONTEXT:
@ -648,7 +659,7 @@ def get_vllm_version() -> str:
if envs.VLLM_TARGET_DEVICE == "empty":
version += f"{sep}empty"
elif _is_cuda():
if envs.VLLM_USE_PRECOMPILED:
if envs.VLLM_USE_PRECOMPILED and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX:
version += f"{sep}precompiled"
else:
cuda_version = str(get_nvcc_cuda_version())
@ -786,7 +797,7 @@ setup(
"bench": ["pandas", "matplotlib", "seaborn", "datasets"],
"tensorizer": ["tensorizer==2.10.1"],
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.0"],
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
"audio": [
"librosa",
"soundfile",

View File

@ -392,39 +392,48 @@ def test_pass_config_deprecation(caplog_vllm):
assert "enable_fusion is deprecated" in caplog_vllm.text
assert config.fuse_norm_quant is True
assert config.fuse_act_quant is True
assert config.enable_fusion is None
assert config.enable_fusion is True
# Test enable_attn_fusion -> fuse_attn_quant
caplog_vllm.clear()
config = PassConfig(enable_attn_fusion=True)
assert "enable_attn_fusion is deprecated" in caplog_vllm.text
assert config.fuse_attn_quant is True
assert config.enable_attn_fusion is None
assert config.enable_attn_fusion is True
# Test enable_noop -> eliminate_noops
caplog_vllm.clear()
config = PassConfig(enable_noop=True)
assert "enable_noop is deprecated" in caplog_vllm.text
assert config.eliminate_noops is True
assert config.enable_noop is None
assert config.enable_noop is True
# Test enable_sequence_parallelism -> enable_sp
caplog_vllm.clear()
config = PassConfig(enable_sequence_parallelism=True)
assert "enable_sequence_parallelism is deprecated" in caplog_vllm.text
assert config.enable_sp is True
assert config.enable_sequence_parallelism is None
assert config.enable_sequence_parallelism is True
# Test enable_async_tp -> fuse_gemm_comms
caplog_vllm.clear()
config = PassConfig(enable_async_tp=True)
assert "enable_async_tp is deprecated" in caplog_vllm.text
assert config.fuse_gemm_comms is True
assert config.enable_async_tp is None
assert config.enable_async_tp is True
# Test enable_fi_allreduce_fusion -> fuse_allreduce_rms
caplog_vllm.clear()
config = PassConfig(enable_fi_allreduce_fusion=True)
assert "enable_fi_allreduce_fusion is deprecated" in caplog_vllm.text
assert config.fuse_allreduce_rms is True
assert config.enable_fi_allreduce_fusion is None
assert config.enable_fi_allreduce_fusion is True
# Test hash consistency
config_old = PassConfig(enable_fusion=True)
config_new = PassConfig(fuse_norm_quant=True, fuse_act_quant=True)
assert config_old.compute_hash() == config_new.compute_hash()
config_old = PassConfig(enable_async_tp=True)
config_new = PassConfig(fuse_gemm_comms=True)
assert config_old.compute_hash() == config_new.compute_hash()

View File

@ -6,6 +6,7 @@ import lm_eval
import pytest
from tests.utils import large_gpu_mark
from vllm.platforms import current_platform
def get_model_args(
@ -45,6 +46,12 @@ def get_model_args(
return model_args
pytestmark = pytest.mark.skipif(
current_platform.is_rocm(),
reason="EPLB with Spec Decode is a work in progress on ROCm.",
)
@pytest.mark.parametrize(
"model_setup",
[

View File

@ -232,7 +232,7 @@ async def test_server_load(server: RemoteOpenAIServer):
@pytest.mark.asyncio
async def test_health_check_engine_dead_error():
# Import the health function directly to test it in isolation
from vllm.entrypoints.openai.api_server import health
from vllm.entrypoints.serve.instrumentator.health import health
# Create a mock request that simulates what FastAPI would provide
mock_request = Mock(spec=Request)

View File

@ -69,9 +69,20 @@ async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):
stream=True,
)
first_chunk = None
chunk_count = 0
async for chunk in resp:
chunk_count += 1
if first_chunk is None and chunk.type == "message_start":
first_chunk = chunk
print(chunk.model_dump_json())
assert chunk_count > 0
assert first_chunk is not None, "message_start chunk was never observed"
assert first_chunk.usage is not None, "first chunk should include usage stats"
assert first_chunk.usage["output_tokens"] == 0
assert first_chunk.usage["input_tokens"] > 5
@pytest.mark.asyncio
async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic):

View File

@ -42,6 +42,24 @@ async def test_basic(client: OpenAI, model_name: str):
assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_enable_response_messages(client: OpenAI, model_name: str):
    """When ``enable_response_messages`` is set, the Responses API should also
    return the raw input/output messages with their token ids."""
    response = await client.responses.create(
        model=model_name,
        input="Hello?",
        # Not part of the upstream OpenAI schema, so it is sent via extra_body.
        extra_body={"enable_response_messages": True},
    )
    assert response.status == "completed"
    # Each returned message entry carries the rendered text plus its token ids.
    assert response.input_messages[0]["type"] == "raw_message_tokens"
    assert type(response.input_messages[0]["message"]) is str
    assert len(response.input_messages[0]["message"]) > 10
    assert type(response.input_messages[0]["tokens"][0]) is int
    assert type(response.output_messages[0]["message"]) is str
    assert len(response.output_messages[0]["message"]) > 10
    assert type(response.output_messages[0]["tokens"][0]) is int
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_item(client: OpenAI, model_name: str):

View File

@ -2,64 +2,47 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io
import numpy as np
import pytest
import requests
import torch
from vllm.utils.serial_utils import tensor2base64
from ...utils import RemoteOpenAIServer
MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
DTYPE = "float16"
def _terratorch_dummy_inputs(model_name: str):
def _terratorch_dummy_messages():
pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
buffer_tiff = io.BytesIO()
torch.save(pixel_values, buffer_tiff)
buffer_tiff.seek(0)
binary_data = buffer_tiff.read()
base64_tensor_embedding = base64.b64encode(binary_data).decode("utf-8")
buffer_coord = io.BytesIO()
torch.save(location_coords, buffer_coord)
buffer_coord.seek(0)
binary_data = buffer_coord.read()
base64_coord_embedding = base64.b64encode(binary_data).decode("utf-8")
return {
"model": model_name,
"additional_data": {"prompt_token_ids": [1]},
"encoding_format": "base64",
"messages": [
{
"role": "user",
"content": [
{
"type": "image_embeds",
"image_embeds": {
"pixel_values": base64_tensor_embedding,
"location_coords": base64_coord_embedding,
},
}
],
}
],
}
return [
{
"role": "user",
"content": [
{
"type": "image_embeds",
"image_embeds": {
"pixel_values": tensor2base64(pixel_values),
"location_coords": tensor2base64(location_coords),
},
}
],
}
]
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_request(model_name: str):
@pytest.mark.parametrize(
"model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
)
def test_single_request(model_name: str):
args = [
"--runner",
"pooling",
# use half precision for speed and memory savings in CI environment
"--dtype",
DTYPE,
"float16",
"--enforce-eager",
"--trust-remote-code",
"--max-num-seqs",
@ -70,11 +53,15 @@ async def test_single_request(model_name: str):
"--enable-mm-embeds",
]
with RemoteOpenAIServer(MODEL_NAME, args) as server:
prompt = _terratorch_dummy_inputs(model_name)
# test single pooling
response = requests.post(server.url_for("pooling"), json=prompt)
with RemoteOpenAIServer(model_name, args) as server:
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"messages": _terratorch_dummy_messages(),
"encoding_format": "base64",
},
)
response.raise_for_status()
output = response.json()["data"][0]["data"]

View File

@ -61,11 +61,8 @@ def test_pooling_params(llm: LLM):
@pytest.mark.skip_global_cleanup
def test_encode_api(llm: LLM):
# chunked prefill does not support all pooling
err_msg = "pooling_task must be one of.+"
with pytest.raises(ValueError, match=err_msg):
llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
def test_token_classify(llm: LLM):
llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
def test_score_api(llm: LLM):

View File

@ -255,21 +255,21 @@ async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str):
# token_classify uses ALL pooling, which does not support chunked prefill.
task = "token_classify"
input_text = ["This product was excellent and exceeded my expectations"]
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": "test",
"input": input_text,
"encoding_format": "float",
"task": task,
},
)
assert response.json()["error"]["type"] == "BadRequestError"
assert response.json()["error"]["message"].startswith(
f"Task {task} is not supported"
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == 8
assert len(poolings.data[0].data[0]) == 2
@pytest.mark.asyncio

View File

@ -42,7 +42,7 @@ def llm():
@pytest.mark.skip_global_cleanup
def test_encode_api(llm: LLM):
def test_token_embed(llm: LLM):
outputs = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)
multi_vector = outputs[0].outputs.data
assert multi_vector.shape == (11, 384)

View File

@ -36,6 +36,13 @@ def llm():
cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_config(llm: LLM):
vllm_config = llm.llm_engine.vllm_config
assert vllm_config.cache_config.enable_prefix_caching
assert vllm_config.scheduler_config.enable_chunked_prefill
def test_pooling_params(llm: LLM):
def get_outputs(use_activation):
outputs = llm.reward(

View File

@ -29,6 +29,7 @@ from vllm.multimodal.utils import (
encode_video_base64,
)
from vllm.tokenizers import MistralTokenizer, get_tokenizer
from vllm.utils.serial_utils import tensor2base64
from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import VLLM_PATH
@ -85,11 +86,6 @@ def phi3v_model_config_image_embeds():
)
@pytest.fixture(scope="module")
def phi3v_tokenizer():
return get_tokenizer(PHI3V_MODEL_ID)
@pytest.fixture(scope="function")
def qwen2_audio_model_config():
return ModelConfig(
@ -115,11 +111,6 @@ def audio_embeds_model_config():
)
@pytest.fixture(scope="module")
def qwen2_audio_tokenizer():
return get_tokenizer(QWEN2AUDIO_MODEL_ID)
@pytest.fixture(scope="function")
def qwen25omni_model_config_mm_interleaved():
return ModelConfig(
@ -134,11 +125,6 @@ def qwen25omni_model_config_mm_interleaved():
)
@pytest.fixture(scope="module")
def qwen25omni_tokenizer():
return get_tokenizer(QWEN25OMNI_MODEL_ID)
@pytest.fixture(scope="function")
def mistral_model_config():
return ModelConfig(
@ -150,11 +136,6 @@ def mistral_model_config():
)
@pytest.fixture(scope="module")
def mistral_tokenizer():
return get_tokenizer(MISTRAL_MODEL_ID)
@pytest.fixture(scope="module")
def image_url():
image = ImageAsset("cherry_blossom")
@ -239,7 +220,6 @@ def _assert_mm_data_inputs(
def test_parse_chat_messages_single_image(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -253,7 +233,6 @@ def test_parse_chat_messages_single_image(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -266,7 +245,6 @@ def test_parse_chat_messages_single_image(
def test_parse_chat_messages_single_image_with_uuid(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid = str(hash(image_url))
@ -287,7 +265,6 @@ def test_parse_chat_messages_single_image_with_uuid(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -300,7 +277,6 @@ def test_parse_chat_messages_single_image_with_uuid(
def test_parse_chat_messages_single_empty_image_with_uuid(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid = str(hash(image_url))
@ -319,7 +295,6 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -332,7 +307,6 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
def test_parse_chat_messages_single_image_with_bad_uuid_format(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid = str(hash(image_url))
@ -354,7 +328,6 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -367,7 +340,6 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
def test_parse_chat_messages_multiple_images_with_uuids(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid1 = "my_uuid_1"
@ -397,7 +369,6 @@ def test_parse_chat_messages_multiple_images_with_uuids(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -413,7 +384,6 @@ def test_parse_chat_messages_multiple_images_with_uuids(
def test_parse_chat_messages_multiple_empty_images_with_uuids(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid1 = "my_uuid_1"
@ -439,7 +409,6 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -455,7 +424,6 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
def test_parse_chat_messages_mixed_empty_images_with_uuids(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid1 = "my_uuid_1"
@ -483,7 +451,6 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -500,7 +467,6 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
@pytest.mark.asyncio
async def test_parse_chat_messages_single_image_with_uuid_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid = str(hash(image_url))
@ -519,7 +485,6 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -533,7 +498,6 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
@pytest.mark.asyncio
async def test_parse_chat_messages_empty_image_with_uuid_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid = str(hash(image_url))
@ -552,7 +516,6 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -566,7 +529,6 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_uuids_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid1 = "my_uuid_1"
@ -592,7 +554,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -609,7 +570,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid1 = "my_uuid_1"
@ -635,7 +595,6 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -652,7 +611,6 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid2 = "my_uuid_2"
@ -676,7 +634,6 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -692,7 +649,6 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
def test_parse_chat_messages_empty_system(
mistral_model_config,
mistral_tokenizer,
):
# Test string format
conversation, _, _ = parse_chat_messages(
@ -704,7 +660,6 @@ def test_parse_chat_messages_empty_system(
},
],
mistral_model_config,
mistral_tokenizer,
content_format="string",
)
assert conversation == [
@ -722,7 +677,6 @@ def test_parse_chat_messages_empty_system(
},
],
mistral_model_config,
mistral_tokenizer,
content_format="openai",
)
assert conversation == [
@ -734,7 +688,6 @@ def test_parse_chat_messages_empty_system(
@pytest.mark.asyncio
async def test_parse_chat_messages_single_image_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
@ -748,7 +701,6 @@ async def test_parse_chat_messages_single_image_async(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -761,7 +713,6 @@ async def test_parse_chat_messages_single_image_async(
def test_parse_chat_messages_multiple_images(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -779,7 +730,6 @@ def test_parse_chat_messages_multiple_images(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -795,7 +745,6 @@ def test_parse_chat_messages_multiple_images(
def test_parse_chat_messages_empty_pil_image_with_uuid(
phi3v_model_config,
phi3v_tokenizer,
):
uuid = "abcd"
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -809,7 +758,6 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -825,7 +773,6 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
def test_parse_chat_messages_empty_image_embeds_with_uuid(
phi3v_model_config_image_embeds,
phi3v_tokenizer,
):
uuid = "abcd"
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -839,7 +786,6 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
}
],
phi3v_model_config_image_embeds,
phi3v_tokenizer,
content_format="string",
)
@ -857,7 +803,6 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
def test_parse_chat_messages_empty_audio_embeds_with_uuid(
audio_embeds_model_config,
qwen2_audio_tokenizer,
):
"""Test audio_embeds with UUID (no actual embeds data)."""
uuid = "test-audio-uuid-123"
@ -873,7 +818,6 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
}
],
audio_embeds_model_config,
qwen2_audio_tokenizer,
content_format="string",
)
@ -889,11 +833,8 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
def test_parse_chat_messages_audio_embeds_with_string(
audio_embeds_model_config,
qwen2_audio_tokenizer,
):
"""Test audio_embeds with base64 string embedding data."""
import base64
import io
import torch
@ -901,11 +842,7 @@ def test_parse_chat_messages_audio_embeds_with_string(
audio_embedding = torch.randn(1, 128, 768)
# Encode it as base64
buffer = io.BytesIO()
torch.save(audio_embedding, buffer)
buffer.seek(0)
binary_data = buffer.read()
base64_audio_embedding = base64.b64encode(binary_data).decode("utf-8")
base64_audio_embedding = tensor2base64(audio_embedding)
conversation, mm_data, mm_uuids = parse_chat_messages(
[
@ -921,7 +858,6 @@ def test_parse_chat_messages_audio_embeds_with_string(
}
],
audio_embeds_model_config,
qwen2_audio_tokenizer,
content_format="string",
)
@ -939,11 +875,8 @@ def test_parse_chat_messages_audio_embeds_with_string(
@pytest.mark.asyncio
async def test_parse_chat_messages_audio_embeds_async(
audio_embeds_model_config,
qwen2_audio_tokenizer,
):
"""Test audio_embeds with async futures."""
import base64
import io
import torch
@ -951,11 +884,7 @@ async def test_parse_chat_messages_audio_embeds_async(
audio_embedding = torch.randn(1, 128, 768)
# Encode it as base64
buffer = io.BytesIO()
torch.save(audio_embedding, buffer)
buffer.seek(0)
binary_data = buffer.read()
base64_audio_embedding = base64.b64encode(binary_data).decode("utf-8")
base64_audio_embedding = tensor2base64(audio_embedding)
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
[
@ -971,7 +900,6 @@ async def test_parse_chat_messages_audio_embeds_async(
}
],
audio_embeds_model_config,
qwen2_audio_tokenizer,
content_format="string",
)
@ -990,7 +918,6 @@ async def test_parse_chat_messages_audio_embeds_async(
@pytest.mark.asyncio
async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
phi3v_model_config_image_embeds,
phi3v_tokenizer,
):
uuid = "abcd"
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
@ -1004,7 +931,6 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
}
],
phi3v_model_config_image_embeds,
phi3v_tokenizer,
content_format="string",
)
@ -1024,7 +950,6 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
@ -1042,7 +967,6 @@ async def test_parse_chat_messages_multiple_images_async(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -1058,7 +982,6 @@ async def test_parse_chat_messages_multiple_images_async(
def test_parse_chat_messages_placeholder_already_in_prompt(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -1076,7 +999,6 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
assert conversation == [
@ -1091,7 +1013,6 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
def test_parse_chat_messages_placeholder_one_already_in_prompt(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -1110,7 +1031,6 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -1127,7 +1047,6 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
def test_parse_chat_messages_multiple_images_across_messages(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -1149,7 +1068,6 @@ def test_parse_chat_messages_multiple_images_across_messages(
},
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -1164,7 +1082,6 @@ def test_parse_chat_messages_multiple_images_across_messages(
def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid = str(hash(image_url))
@ -1195,7 +1112,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
},
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -1210,7 +1126,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
def test_parse_chat_messages_context_text_format(
phi3v_model_config,
phi3v_tokenizer,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
[
@ -1222,7 +1137,6 @@ def test_parse_chat_messages_context_text_format(
{"role": "user", "content": "What about this one?"},
],
phi3v_model_config,
phi3v_tokenizer,
content_format="openai",
)
@ -1246,7 +1160,6 @@ def test_parse_chat_messages_context_text_format(
def test_parse_chat_messages_rejects_too_many_images_in_one_message(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
with warnings.catch_warnings():
@ -1277,14 +1190,12 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
def test_parse_chat_messages_rejects_too_many_images_across_messages(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
with warnings.catch_warnings():
@ -1322,14 +1233,12 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages(
},
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
def test_parse_chat_messages_multiple_images_uncommon_input(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -1344,7 +1253,6 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -1360,7 +1268,6 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
def test_parse_chat_messages_multiple_images_interleave(
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -1380,7 +1287,6 @@ def test_parse_chat_messages_multiple_images_interleave(
}
],
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
content_format="string",
)
@ -1398,7 +1304,6 @@ def test_parse_chat_messages_multiple_images_interleave(
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_interleave_async(
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages_futures(
@ -1418,7 +1323,6 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
}
],
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
content_format="string",
)
@ -1436,7 +1340,6 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
image_url,
):
image_uuid = str(hash(image_url))
@ -1465,7 +1368,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
}
],
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
content_format="string",
)
@ -1482,7 +1384,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -1505,7 +1406,6 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
},
],
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
content_format="string",
)
@ -1523,7 +1423,6 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
image_url,
):
image_uuid = str(hash(image_url))
@ -1555,7 +1454,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
},
],
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
content_format="string",
)
@ -1573,7 +1471,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
image_url,
video_url,
audio_url,
@ -1601,7 +1498,6 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
},
],
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
content_format="string",
)
@ -1627,7 +1523,6 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
image_url,
video_url,
audio_url,
@ -1671,7 +1566,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
},
],
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
content_format="string",
)
@ -1699,7 +1593,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave( # noqa: E501
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
image_url,
video_url,
audio_url,
@ -1743,7 +1636,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
},
],
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
content_format="string",
)
@ -1775,7 +1667,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
image_url,
video_url,
audio_url,
@ -1811,7 +1702,6 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
},
],
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
content_format="string",
)
@ -1837,7 +1727,6 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
image_url,
):
with pytest.raises(
@ -1861,7 +1750,6 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
}
],
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
content_format="string",
)
@ -2237,9 +2125,7 @@ def test_resolve_content_format_examples(template_path, expected_format):
assert resolved_format == expected_format
def test_parse_chat_messages_include_thinking_chunk(
mistral_model_config, mistral_tokenizer
):
def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
messages = [
{
"role": "system",
@ -2269,7 +2155,6 @@ def test_parse_chat_messages_include_thinking_chunk(
conversation_with_thinking, _, _ = parse_chat_messages(
messages,
mistral_model_config,
mistral_tokenizer,
content_format="openai",
)
@ -2353,7 +2238,6 @@ def test_apply_mistral_chat_template_thinking_chunk():
def test_parse_chat_messages_single_empty_audio_with_uuid(
qwen2_audio_model_config,
qwen2_audio_tokenizer,
):
audio_uuid = "abcd"
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -2371,7 +2255,6 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
}
],
qwen2_audio_model_config,
qwen2_audio_tokenizer,
content_format="string",
)
@ -2389,7 +2272,6 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
@pytest.mark.asyncio
async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
qwen2_audio_model_config,
qwen2_audio_tokenizer,
):
audio_uuid = "abcd"
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
@ -2407,7 +2289,6 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
}
],
qwen2_audio_model_config,
qwen2_audio_tokenizer,
content_format="string",
)

View File

@ -13,9 +13,6 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import (
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
BatchedDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (
BatchedTritonOrDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEQuantConfig,
@ -286,16 +283,6 @@ if has_deep_gemm() and is_deep_gemm_supported():
needs_matching_quant=False,
needs_deep_gemm=True,
)
register_experts(
BatchedTritonOrDeepGemmExperts,
batched_format,
common_float_and_int_types,
blocked_quantization_support=True,
supports_chunking=False,
supports_expert_map=False,
needs_matching_quant=True,
needs_deep_gemm=True,
)
register_experts(
TritonOrDeepGemmExperts,
standard_format,
@ -457,10 +444,6 @@ def make_fused_experts(
kwargs = batch_kwargs | quant_kwargs
print(f"Making BatchedTritonExperts {kwargs} ...")
experts = BatchedTritonExperts(**kwargs)
elif fused_experts_type == BatchedTritonOrDeepGemmExperts:
kwargs = batch_kwargs | quant_kwargs | deepgemm_kwargs
print(f"Making BatchedTritonOrDeepGemmExperts {kwargs} ...")
experts = BatchedTritonOrDeepGemmExperts(**kwargs)
elif fused_experts_type == DeepGemmExperts:
print(f"Making DeepGemmExperts {quant_config} ...")
experts = DeepGemmExperts(quant_config)

View File

@ -0,0 +1,86 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_per_token_group_quant_fp8_colmajor,
silu_mul_per_token_group_quant_fp8_colmajor,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
FLOAT8_DTYPE = torch.float8_e4m3fn
GROUP_SIZE = 128
def reference_quant(
    x: torch.Tensor, use_ue8m0: bool
) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize `x` to FP8 with per-token-group scales, as a reference.

    Launches the reference Triton kernel
    `_per_token_group_quant_fp8_colmajor` from
    vllm.model_executor.layers.quantization.utils.fp8_utils, one program
    per group of GROUP_SIZE elements.

    Args:
        x: 2-D input tensor; its last dimension must be divisible by
            GROUP_SIZE (one scale is produced per GROUP_SIZE elements).
        use_ue8m0: Whether scales use the UE8M0 (power-of-two) format,
            as required by some DeepGEMM configurations.

    Returns:
        (x_q, x_s): the FP8-quantized tensor (same shape as `x`) and the
        per-group scales laid out column-major (see below).
    """
    x_q = torch.empty_like(x, device=x.device, dtype=FLOAT8_DTYPE)
    # Allocate the scale tensor in column-major format.
    # Shape is (groups_per_row, num_tokens) permuted so that indexing is
    # (num_tokens, groups_per_row) while memory stays column-major.
    shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
    x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
    # One Triton program per quantization group.
    M = x.numel() // GROUP_SIZE
    N = GROUP_SIZE
    BLOCK = triton.next_power_of_2(N)
    # heuristics for number of warps
    num_warps = min(max(BLOCK // 256, 1), 8)
    num_stages = 1
    # FP8 representable range used for clamping inside the kernel.
    finfo = torch.finfo(FLOAT8_DTYPE)
    fp8_min = finfo.min
    fp8_max = finfo.max
    _per_token_group_quant_fp8_colmajor[(M,)](
        x,
        x_q,
        x_s,
        GROUP_SIZE,
        x.shape[1],
        x.stride(0),
        x_s.stride(1),
        eps=1e-10,
        fp8_min=fp8_min,
        fp8_max=fp8_max,
        use_ue8m0=use_ue8m0,
        BLOCK=BLOCK,
        num_warps=num_warps,
        num_stages=num_stages,
    )
    return x_q, x_s
def reference(x: torch.Tensor, use_ue8m0: bool) -> tuple[torch.Tensor, torch.Tensor]:
    """Reference path: unfused silu_and_mul, then reference FP8 group quant."""
    num_tokens, width = x.size()
    # silu_and_mul halves the hidden dimension: silu(x[:, :w/2]) * x[:, w/2:]
    activated = torch.empty(
        (num_tokens, width // 2), dtype=torch.bfloat16, device="cuda"
    )
    torch.ops._C.silu_and_mul(activated, x)
    return reference_quant(activated, use_ue8m0)
@pytest.mark.parametrize("T", [128, 256, 512])
@pytest.mark.parametrize("N", [128 * 2, 256 * 2, 768 * 2, 2048 * 2, 7168 * 2])
def test_silu_mul_fp8_quant_deep_gemm(T: int, N: int):
    """Fused silu+mul+quant kernel must match the two-step reference path."""
    current_platform.seed_everything(42)
    x = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
    use_ue8m0 = is_deep_gemm_e8m0_used()

    # Kernel under test: fused activation + column-major FP8 group quant.
    got_q, got_s = silu_mul_per_token_group_quant_fp8_colmajor(
        x, use_ue8m0=use_ue8m0
    )

    # Reference: unfused silu_and_mul followed by the reference quant kernel.
    want_q, want_s = reference(x, use_ue8m0)

    # Compare in float32 since FP8 tensors do not support direct comparison ops.
    torch.testing.assert_close(got_q.to(torch.float32), want_q.to(torch.float32))
    torch.testing.assert_close(got_s, want_s)

View File

@ -0,0 +1,39 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
from vllm.v1.executor import UniProcExecutor
from vllm.v1.worker.worker_base import WorkerWrapperBase
# This is a dummy executor for patching in test_runai_model_streamer_s3.py.
# We cannot use the vllm_runner fixture here because it spawns a worker process;
# the worker process reimports the patched entities, so the patch is not applied.
class RunaiDummyExecutor(UniProcExecutor):
    """Single-process executor that builds its worker in-process.

    Used by the runai streamer tests so that monkeypatched entities stay
    visible to the worker (a spawned worker process would reimport the
    unpatched modules).
    """

    def _init_executor(self) -> None:
        # Derive the local rank from the configured device string,
        # e.g. "cuda:1" -> 1; device names without an index default to 0.
        device_str = str(self.vllm_config.device_config.device)
        _, sep, index = device_str.partition(":")
        local_rank = int(index) if sep else 0

        init_method = get_distributed_init_method(get_ip(), get_open_port())
        worker_kwargs = {
            "vllm_config": self.vllm_config,
            "local_rank": local_rank,
            "rank": 0,
            "distributed_init_method": init_method,
            "is_driver_worker": True,
        }

        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config)
        self.collective_rpc("init_worker", args=([worker_kwargs],))
        self.collective_rpc("init_device")

View File

@ -0,0 +1,52 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
from huggingface_hub import snapshot_download
from runai_model_streamer.safetensors_streamer.streamer_mock import StreamerPatcher
from vllm.engine.arg_utils import EngineArgs
from .conftest import RunaiDummyExecutor
load_format = "runai_streamer"
test_model = "openai-community/gpt2"
def test_runai_model_loader_download_files_s3_mocked_with_patch(
    vllm_runner,
    tmp_path: Path,
    monkeypatch,
):
    """Load a mocked s3:// model end-to-end through the runai streamer shims."""
    patcher = StreamerPatcher(str(tmp_path))

    # Fetch the real weights from HF into the directory the mock serves from.
    snapshot_download(repo_id=test_model, local_dir=f"{tmp_path}/gpt2")

    # Route every streamer entry point through the in-process mock.
    for target, shim in (
        (
            "vllm.transformers_utils.runai_utils.runai_list_safetensors",
            patcher.shim_list_safetensors,
        ),
        (
            "vllm.transformers_utils.runai_utils.runai_pull_files",
            patcher.shim_pull_files,
        ),
        (
            "vllm.model_executor.model_loader.weight_utils.SafetensorsStreamer",
            patcher.create_mock_streamer,
        ),
    ):
        monkeypatch.setattr(target, shim)

    vllm_config = EngineArgs(
        model="s3://my-mock-bucket/gpt2/",
        load_format=load_format,
        tensor_parallel_size=1,
    ).create_engine_config()

    # Loading the model exercises the patched list/pull/stream path.
    RunaiDummyExecutor(vllm_config).driver_worker.load_model()

View File

@ -0,0 +1,53 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from transformers import AutoModel
from tests.models.utils import check_embeddings_close
from vllm import TokensPrompt
@pytest.mark.parametrize(
    "model",
    ["Qwen/Qwen3-Embedding-0.6B"],
)
@torch.inference_mode
def test_embed_models(hf_runner, vllm_runner, model: str):
    """Token embeddings from vLLM with chunked prefill must match HF."""
    chunk_size = 10
    # Prompt lengths straddle the chunk size so prompts span multiple chunks.
    token_prompts = [[1024 + i for i in range(n)] for n in (55, 56, 57)]

    with vllm_runner(
        model,
        runner="pooling",
        max_model_len=128,
        max_num_batched_tokens=chunk_size,
        enforce_eager=True,
        # NOTE: VllmRunner would otherwise pass `False` rather than `None`;
        # force chunked prefill on explicitly.
        enable_chunked_prefill=True,
        enable_prefix_caching=True,
    ) as vllm_model:
        vllm_outputs = vllm_model.token_embed(
            [TokensPrompt(prompt_token_ids=t) for t in token_prompts],
        )

    with hf_runner(
        model,
        auto_cls=AutoModel,
    ) as hf_model:
        hf_outputs = []
        for prompt in token_prompts:
            batch = hf_model.wrap_device({"input_ids": torch.tensor([prompt])})
            hidden = hf_model.model(batch["input_ids"]).last_hidden_state
            hf_outputs.append(hidden.cpu().float()[0])

    for expected, actual in zip(hf_outputs, vllm_outputs):
        check_embeddings_close(
            embeddings_0_lst=expected,
            embeddings_1_lst=actual,
            name_0="hf",
            name_1="vllm",
            tol=1e-2,
        )

View File

@ -20,7 +20,6 @@ def test_extract_hidden_states(hf_runner, vllm_runner, model: str):
max_model_len=128,
enforce_eager=True,
runner="pooling",
enable_chunked_prefill=False,
enable_prefix_caching=True,
) as vllm_model:
pooling_outputs = vllm_model.llm.encode(

View File

@ -2,6 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM tests."""
import warnings
import torch
from vllm.platforms import current_platform
@ -14,6 +16,20 @@ def pytest_configure(config):
if not current_platform.is_rocm():
return
skip_patterns = ["test_granite_speech.py"]
if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
# Skip disabling SDP for Granite Speech tests on ROCm
return
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
# accuracy issues
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)
warnings.warn(
"ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
"to avoid HuggingFace Transformers accuracy issues",
UserWarning,
stacklevel=1,
)

View File

@ -137,7 +137,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
"qwen2_5_omni": VLMTestInfo(
@ -152,7 +152,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForTextToWaveform,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
"qwen3_vl": VLMTestInfo(
@ -173,7 +173,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[
pytest.mark.core_model,
],
@ -350,7 +350,7 @@ VLM_TEST_SETTINGS = {
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
stop_str=["<end▁of▁sentence>", "<begin▁of▁sentence>"],
image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
image_size_factors=[(1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
),
"fuyu": VLMTestInfo(
models=["adept/fuyu-8b"],
@ -403,12 +403,13 @@ VLM_TEST_SETTINGS = {
# So, we need to reduce the number of tokens for the test to pass.
max_tokens=8,
num_logprobs=10,
auto_cls=AutoModelForCausalLM,
marks=[large_gpu_mark(min_gb=32)],
),
"glm4_1v": VLMTestInfo(
models=["zai-org/GLM-4.1V-9B-Thinking"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
max_model_len=2048,
@ -423,6 +424,7 @@ VLM_TEST_SETTINGS = {
models=["zai-org/GLM-4.1V-9B-Thinking"],
# GLM4.1V require include video metadata for input
test_type=VLMTestType.CUSTOM_INPUTS,
prompt_formatter=lambda vid_prompt: f"[gMASK]<|user|>\n{vid_prompt}<|assistant|>\n", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
@ -707,7 +709,7 @@ VLM_TEST_SETTINGS = {
max_model_len=8192,
max_num_seqs=2,
auto_cls=AutoModelForCausalLM,
image_size_factors=[(), (0.25,)],
image_size_factors=[(0.25,)],
marks=[
pytest.mark.skipif(
Version(TRANSFORMERS_VERSION) == Version("4.57.3"),
@ -737,7 +739,13 @@ VLM_TEST_SETTINGS = {
max_model_len=8192,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=48)],
marks=[
large_gpu_mark(min_gb=48),
pytest.mark.skipif(
current_platform.is_rocm(),
reason="Model produces a vector of <UNK> output in HF on ROCm",
),
],
),
"qwen_vl": VLMTestInfo(
models=["Qwen/Qwen-VL"],
@ -760,7 +768,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.cpu_model],
),
"skywork_r1v": VLMTestInfo(
@ -812,7 +820,7 @@ VLM_TEST_SETTINGS = {
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.skip("Model initialization hangs")],
),
### Tensor parallel / multi-gpu broadcast tests

View File

@ -8,6 +8,7 @@ from transformers import AutoModelForSpeechSeq2Seq
from vllm.logprobs import SampleLogprobs
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
from ...registry import HF_EXAMPLE_MODELS
@ -34,6 +35,12 @@ audio_lora_path = MODEL_NAME
models = [MODEL_NAME]
@pytest.fixture(autouse=True)
def set_attention_backend_for_rocm(monkeypatch):
    """Force the Triton attention backend for every test when running on ROCm."""
    if not current_platform.is_rocm():
        return
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
@ -111,8 +118,12 @@ def run_test(
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_model_len", [2048])
@pytest.mark.parametrize(
"dtype", ["float16"] if current_platform.is_rocm() else ["bfloat16"]
)
@pytest.mark.parametrize(
"max_model_len", [512] if current_platform.is_rocm() else [2048]
)
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(

View File

@ -1,281 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from collections.abc import Sequence
import librosa
import pytest
from huggingface_hub import snapshot_download
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import rescale_image_size
from ....conftest import (
IMAGE_ASSETS,
HfRunner,
PromptAudioInput,
PromptImageInput,
VllmRunner,
)
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom": "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
}
)
HF_MULTIIMAGE_IMAGE_PROMPT = (
"<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
)
model_path = snapshot_download(
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
speech_question = os.path.join(
model_path, "examples", "what_is_shown_in_this_image.wav"
)
models = [model_path]
target_dtype = "half"
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
):
    """Inference result should be the same between hf and vllm.

    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.

    Each element of ``inputs`` is one test case: (prompts, images, audios);
    logprobs from both runners are compared case-by-case at the end.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

    # max_model_len should be greater than image_feature_size
    with vllm_runner(
        model,
        task="generate",
        max_model_len=max_model_len,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"image": mm_limit},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enable_lora=True,
        # The vision LoRA shipped with Phi-4-multimodal uses a large rank,
        # so the default max rank is not sufficient here.
        max_lora_rank=320,
        gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
        enforce_eager=True,
        trust_remote_code=False,
    ) as vllm_model:
        # Apply the vision LoRA on top of the base model for every request.
        lora_request = LoRARequest("vision", 1, vision_lora_path)
        vllm_outputs_per_case = [
            vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images,
                audios=audios,
                lora_request=lora_request,
            )
            for prompts, images, audios in inputs
        ]

    with hf_runner(model, dtype=dtype) as hf_model:
        # Mirror the vLLM setup: load the same vision LoRA adapter in HF.
        hf_model.model.load_adapter(
            vision_lora_path,
            adapter_name="vision",
        )
        hf_processor = hf_model.processor
        # Pass eos explicitly so HF generation stops like vLLM's does.
        eos_token_id = hf_processor.tokenizer.eos_token_id
        hf_outputs_per_case = [
            hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images,
                audios=audios,
                eos_token_id=eos_token_id,
            )
            for prompts, images, audios in inputs
        ]

    # Compare HF vs vLLM logprobs for each input case.
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(
    hf_runner,
    vllm_runner,
    image_assets,
    model,
    size_factors,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare single-image HF vs vLLM outputs across image scale factors."""
    pil_images = [asset.pil_image for asset in image_assets]

    # One case per asset: the asset's prompt repeated once per scale factor,
    # paired with the asset rescaled by each factor (no audio input).
    case_inputs = []
    for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        rescaled = [rescale_image_size(pil_image, factor) for factor in size_factors]
        case_inputs.append(([prompt] * len(size_factors), rescaled, None))

    run_test(
        hf_runner,
        vllm_runner,
        case_inputs,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        # [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(
    hf_runner,
    vllm_runner,
    image_assets,
    model,
    size_factors,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare HF vs vLLM outputs when two images share a single prompt."""
    pil_images = [asset.pil_image for asset in image_assets]

    # One prompt per scale factor; each prompt gets ALL assets rescaled by
    # that factor (multi-image input), and no audio.
    scaled_image_lists = []
    for factor in size_factors:
        scaled_image_lists.append(
            [rescale_image_size(pil_image, factor) for pil_image in pil_images]
        )
    multi_image_case = (
        [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors),
        scaled_image_lists,
        None,
    )

    run_test(
        hf_runner,
        vllm_runner,
        [multi_image_case],
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(
    hf_runner,
    vllm_runner,
    model,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare HF vs vLLM outputs for a combined image + audio prompt."""
    # use the example speech question so that the model outputs are reasonable
    speech_audio = librosa.load(speech_question, sr=16000)
    blossom_image = ImageAsset("cherry_blossom").pil_image.convert("RGB")

    combined_case = (
        ["<|user|><|image|><|audio|><|end|><|assistant|>"],
        [blossom_image],
        [speech_audio],
    )

    run_test(
        hf_runner,
        vllm_runner,
        [combined_case],
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )

View File

@ -15,6 +15,7 @@ from transformers import AutoProcessor
from vllm import SamplingParams, TextPrompt, TokensPrompt
from vllm.logprobs import Logprob, SampleLogprobs
from vllm.multimodal import MultiModalDataBuiltins
from vllm.platforms import current_platform
from ....utils import VLLM_PATH, large_gpu_test
from ...utils import check_logprobs_close
@ -165,6 +166,15 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
def test_chat(
vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
) -> None:
if (
model == MISTRAL_SMALL_3_1_ID
and max_model_len == 65536
and current_platform.is_rocm()
):
pytest.skip(
"OOM on ROCm: 24B model with 65536 context length exceeds GPU memory"
)
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
with vllm_runner(
model,

View File

@ -62,6 +62,65 @@ def get_filtered_test_settings(
return matching_tests
def get_model_type_cases(
    model_type: str,
    test_info: VLMTestInfo,
    test_type: VLMTestType,
):
    """Expand one model type's VLMTestInfo into individual pytest params.

    This is essentially the same as nesting a bunch of mark.parametrize
    decorators, but done programmatically so that per-model overrides are
    possible while each combination still runs as its own pytest case.
    """

    def _as_tuple(value):
        # Ensure each axis below is an iterable, wrapping scalars if needed.
        return value if isinstance(value, (list, tuple)) else (value,)

    axes = OrderedDict()
    axes["model"] = _as_tuple(test_info.models)
    axes["max_tokens"] = _as_tuple(test_info.max_tokens)
    axes["num_logprobs"] = _as_tuple(test_info.num_logprobs)
    axes["dtype"] = _as_tuple(test_info.dtype)
    axes["distributed_executor_backend"] = _as_tuple(
        test_info.distributed_executor_backend
    )

    # num_frames is video only
    if test_type == VLMTestType.VIDEO:
        axes["num_video_frames"] = _as_tuple(test_info.num_video_frames)
        axes["needs_video_metadata"] = _as_tuple(test_info.needs_video_metadata)

    if test_type not in (
        VLMTestType.CUSTOM_INPUTS,
        VLMTestType.AUDIO,
    ):
        # No sizes passed for custom inputs, since inputs are directly provided
        wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
        if wrapped_sizes is None:
            raise ValueError(f"Sizes must be set for test type {test_type}")
        axes["size_wrapper"] = wrapped_sizes
    elif test_type == VLMTestType.CUSTOM_INPUTS:
        # Otherwise expand the custom test options instead
        if test_info.custom_test_opts is None:
            raise ValueError("Test has type CUSTOM_INPUTS, but none given")
        axes["custom_test_opts"] = test_info.custom_test_opts

    # Wrap every combination in a pytest parameter, passing marks through.
    marks = test_info.marks if test_info.marks is not None else []
    cases = []
    for combo in itertools.product(*axes.values()):
        expanded = ExpandableVLMTestArgs(**dict(zip(axes.keys(), combo)))
        cases.append(pytest.param(model_type, expanded, marks=marks))
    return cases
def get_parametrized_options(
test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
@ -76,64 +135,11 @@ def get_parametrized_options(
test_settings, test_type, create_new_process_for_each_test
)
# Ensure that something is wrapped as an iterable it's not already
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
# This is essentially the same as nesting a bunch of mark.parametrize
# decorators, but we do it programmatically to allow overrides for on
# a per-model basis, while still being able to execute each of these
# as individual test cases in pytest.
iter_kwargs = OrderedDict(
[
("model", ensure_wrapped(test_info.models)),
("max_tokens", ensure_wrapped(test_info.max_tokens)),
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
("dtype", ensure_wrapped(test_info.dtype)),
(
"distributed_executor_backend",
ensure_wrapped(test_info.distributed_executor_backend),
),
]
)
# num_frames is video only
if test_type == VLMTestType.VIDEO:
iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
iter_kwargs["needs_video_metadata"] = ensure_wrapped(
test_info.needs_video_metadata
)
# No sizes passed for custom inputs, since inputs are directly provided
if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
if wrapped_sizes is None:
raise ValueError(f"Sizes must be set for test type {test_type}")
iter_kwargs["size_wrapper"] = wrapped_sizes
# Otherwise expand the custom test options instead
elif test_type == VLMTestType.CUSTOM_INPUTS:
if test_info.custom_test_opts is None:
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
# Wrap all model cases in a pytest parameter & pass marks through
return [
pytest.param(
model_type,
ExpandableVLMTestArgs(
**{k: v for k, v in zip(iter_kwargs.keys(), case)}
),
marks=test_info.marks if test_info.marks is not None else [],
)
for case in list(itertools.product(*iter_kwargs.values()))
]
# Get a list per model type, where each entry contains a tuple of all of
# that model type's cases, then flatten them into the top level so that
# we can consume them in one mark.parametrize call.
cases_by_model_type = [
get_model_type_cases(model_type, test_info)
get_model_type_cases(model_type, test_info, test_type)
for model_type, test_info in matching_tests.items()
]
return list(itertools.chain(*cases_by_model_type))

View File

@ -140,7 +140,7 @@ def video_with_metadata_glm4_1v():
metadata = VIDEO_ASSETS[0].metadata
question = "Describe the video."
video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"
formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"
formatted_prompt = f"[gMASK]<|user|>\n{video_prompt}{question}<|assistant|>\n"
scales = [0.1, 0.2, 0.25]
video_input = [

View File

@ -25,6 +25,7 @@ from transformers import (
from transformers.video_utils import VideoMetadata
from vllm.logprobs import SampleLogprobs
from vllm.platforms import current_platform
from vllm.utils.collection_utils import is_list_of
from .....conftest import HfRunner, ImageAsset, ImageTestAssets
@ -366,6 +367,40 @@ def gemma3_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOut
def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for GLM4V."""
if current_platform.is_rocm():
import types
config = hf_model.model.config
if hasattr(config, "num_layers") and not hasattr(config, "num_hidden_layers"):
config.num_hidden_layers = config.num_layers
config.output_hidden_states = True
def patched_prepare_cache(
self, generation_config, model_kwargs, *args, **kwargs
):
model_kwargs["past_key_values"] = None
model_kwargs["use_cache"] = False
return model_kwargs
hf_model.model._prepare_cache_for_generation = types.MethodType(
patched_prepare_cache, hf_model.model
)
original_generate = hf_model.model.generate
def patched_generate(*args, **kwargs):
kwargs["output_hidden_states"] = True
kwargs["return_dict_in_generate"] = True
return original_generate(*args, **kwargs)
hf_model.model.generate = patched_generate
original_forward = hf_model.model.forward
def patched_forward(*args, **kwargs):
kwargs["output_hidden_states"] = True
return original_forward(*args, **kwargs)
hf_model.model.forward = patched_forward
hf_processor = hf_model.processor
def processor(*args, text="", images=None, **kwargs):
@ -406,7 +441,15 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
if videos is not None and is_list_of(videos, tuple):
# If videos is a list of tuples, we assume each tuple contains
# (video_array, metadata) as in the case of GLM4.1V.
video_metadata = [[VideoMetadata(**video[1])] for video in videos]
# Filter out 'do_sample_frames' as it's not a valid VideoMetadata arg
video_metadata = [
[
VideoMetadata(
**{k: v for k, v in video[1].items() if k != "do_sample_frames"}
)
]
for video in videos
]
videos = [[video[0]] for video in videos]
else:
video_metadata = None

View File

@ -50,8 +50,8 @@ MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PL
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
IMAGE_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
EMBEDDING_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0)]
IMAGE_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
EMBEDDING_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0)]
RunnerOutput = tuple[list[int], str, SampleLogprobs | None]

View File

@ -0,0 +1,24 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling tests."""
import os
import warnings
from vllm.platforms import current_platform
def pytest_collection_modifyitems(config, items):
    """Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
    if not current_platform.is_rocm():
        return
    # Only flip the backend when at least one SigLIP test was collected.
    collected_siglip = any("test_siglip" in item.nodeid for item in items)
    if collected_siglip:
        os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
        warnings.warn(
            "ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
            UserWarning,
            stacklevel=1,
        )

View File

@ -396,28 +396,6 @@ def test_processing_correctness(
)
# Phi4MultimodalForCausalLM share same model repo with original format
# Phi4MMForCausalLM, so we add it as a separate test case
# Remove this test after conversion PR merged:
# https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/70
@pytest.mark.parametrize("model_arch", ["Phi4MultimodalForCausalLM"])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
def test_processing_correctness_phi4_multimodal(
model_arch: str,
hit_rate: float,
num_batches: int,
simplify_rate: float,
):
_test_processing_correctness(
model_arch,
hit_rate=hit_rate,
num_batches=num_batches,
simplify_rate=simplify_rate,
)
def _assert_inputs_equal(
a: MultiModalInputs,
b: MultiModalInputs,

View File

@ -47,6 +47,12 @@ QWEN2_CONFIG = GGUFTestConfig(
gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
)
QWEN3_CONFIG = GGUFTestConfig(
original_model="Qwen/Qwen3-0.6B",
gguf_repo="unsloth/Qwen3-0.6B-GGUF",
gguf_filename="Qwen3-0.6B-BF16.gguf",
)
PHI3_CONFIG = GGUFTestConfig(
original_model="microsoft/Phi-3.5-mini-instruct",
gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
@ -87,6 +93,7 @@ GEMMA3_CONFIG = GGUFTestConfig(
MODELS = [
# LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
QWEN2_CONFIG,
QWEN3_CONFIG,
PHI3_CONFIG,
GPT2_CONFIG,
STABLELM_CONFIG,

View File

@ -667,6 +667,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"moonshotai/Kimi-VL-A3B-Instruct",
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},
trust_remote_code=True,
max_transformers_version="4.53.3",
transformers_version_reason="HF model uses deprecated transformers API "
"(PytorchGELUTanh, DynamicCache.seen_tokens, and more). See: "
"https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31",
),
"LightOnOCRForConditionalGeneration": _HfExamplesInfo(
"lightonai/LightOnOCR-1B",
@ -767,10 +771,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Phi4MMForCausalLM": _HfExamplesInfo(
"microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
),
"Phi4MultimodalForCausalLM": _HfExamplesInfo(
"microsoft/Phi-4-multimodal-instruct",
revision="refs/pr/70",
),
"PixtralForConditionalGeneration": _HfExamplesInfo(
"mistralai/Pixtral-12B-2409",
extras={

View File

@ -112,7 +112,7 @@ class TestBaseThinkingReasoningParserMethods:
"""Test the is_reasoning_end method."""
parser = TestThinkingReasoningParser(test_tokenizer)
end_token_id = parser.end_token_id
start_token_id = parser.start_token_id
# Test with end token present
assert parser.is_reasoning_end([1, 2, end_token_id, 4]) is True
@ -122,6 +122,16 @@ class TestBaseThinkingReasoningParserMethods:
# Test with empty list
assert parser.is_reasoning_end([]) is False
# Test with interleaved thinking
assert parser.is_reasoning_end([1, start_token_id, 2, end_token_id]) is True
assert parser.is_reasoning_end([1, start_token_id, 2, 3]) is False
assert (
parser.is_reasoning_end(
[1, start_token_id, 2, end_token_id, 2, 2, start_token_id]
)
is False
)
def test_extract_content_ids(self, test_tokenizer):
"""Test the extract_content_ids method."""
parser = TestThinkingReasoningParser(test_tokenizer)

View File

@ -5,6 +5,10 @@
set -e
set -x
merge_base_commit=$(git merge-base HEAD origin/main)
echo "Current merge base commit with main: $merge_base_commit"
git show --oneline -s $merge_base_commit
cd /vllm-workspace/
# uninstall vllm
@ -18,7 +22,7 @@ apt autoremove -y
echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL=1 VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e .
VLLM_PRECOMPILED_WHEEL_COMMIT=$merge_base_commit VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e .
# Run the script
python3 -c 'import vllm'

View File

@ -629,8 +629,8 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
(
"internlm/internlm2-1_8b-reward",
"decoder",
False,
"Pooling models with all pooling does not support chunked prefill.",
True,
"Pooling models with causal attn and all pooling support chunked prefill.",
),
(
"BAAI/bge-base-en",
@ -748,8 +748,8 @@ def test_is_chunked_prefill_supported(
(
"internlm/internlm2-1_8b-reward",
"decoder",
False,
"Pooling models with all pooling does not support prefix caching.",
True,
"Pooling models with causal attn and all pooling support prefix caching.",
),
(
"BAAI/bge-base-en",

View File

@ -365,3 +365,54 @@ class TestEnvSetWithChoices:
with patch.dict(os.environ, {"TEST_ENV": "option1,option1,option2"}):
env_func = env_set_with_choices("TEST_ENV", [], ["option1", "option2"])
assert env_func() == {"option1", "option2"}
class TestVllmConfigureLogging:
    """Test cases for VLLM_CONFIGURE_LOGGING environment variable."""

    def test_configure_logging_defaults_to_true(self):
        """Test that VLLM_CONFIGURE_LOGGING defaults to True when not set."""
        # Ensure the env var is not set
        with patch.dict(os.environ, {}, clear=False):
            if "VLLM_CONFIGURE_LOGGING" in os.environ:
                del os.environ["VLLM_CONFIGURE_LOGGING"]
            # Clear cache if it exists
            # (envs may memoize attribute reads; clearing makes this read
            # observe the patched environment — presumably lru_cache-backed)
            if hasattr(envs.__getattr__, "cache_clear"):
                envs.__getattr__.cache_clear()
            result = envs.VLLM_CONFIGURE_LOGGING
            assert result is True
            assert isinstance(result, bool)

    def test_configure_logging_with_zero_string(self):
        """Test that VLLM_CONFIGURE_LOGGING='0' evaluates to False."""
        with patch.dict(os.environ, {"VLLM_CONFIGURE_LOGGING": "0"}):
            # Clear cache if it exists
            if hasattr(envs.__getattr__, "cache_clear"):
                envs.__getattr__.cache_clear()
            result = envs.VLLM_CONFIGURE_LOGGING
            assert result is False
            assert isinstance(result, bool)

    def test_configure_logging_with_one_string(self):
        """Test that VLLM_CONFIGURE_LOGGING='1' evaluates to True."""
        with patch.dict(os.environ, {"VLLM_CONFIGURE_LOGGING": "1"}):
            # Clear cache if it exists
            if hasattr(envs.__getattr__, "cache_clear"):
                envs.__getattr__.cache_clear()
            result = envs.VLLM_CONFIGURE_LOGGING
            assert result is True
            assert isinstance(result, bool)

    def test_configure_logging_with_invalid_value_raises_error(self):
        """Test that invalid VLLM_CONFIGURE_LOGGING value raises ValueError."""
        with patch.dict(os.environ, {"VLLM_CONFIGURE_LOGGING": "invalid"}):
            # Clear cache if it exists
            if hasattr(envs.__getattr__, "cache_clear"):
                envs.__getattr__.cache_clear()
            # Non-numeric strings fail the int() conversion inside envs.
            with pytest.raises(ValueError, match="invalid literal for int"):
                _ = envs.VLLM_CONFIGURE_LOGGING

View File

@ -0,0 +1,847 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from collections.abc import Generator
import partial_json_parser
import pytest
from mistral_common.protocol.instruct.messages import AssistantMessage
from mistral_common.protocol.instruct.request import InstructRequest
from mistral_common.protocol.instruct.tool_calls import FunctionCall, ToolCall
from partial_json_parser.core.options import Allow
from vllm.entrypoints.openai.protocol import DeltaMessage, DeltaToolCall
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolParser
from vllm.tokenizers import (
MistralTokenizer,
TokenizerLike,
get_tokenizer,
)
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
@pytest.fixture(scope="module")
def mistral_pre_v11_tokenizer():
    """Module-scoped pre-v11 Mistral tokenizer (can encode free text)."""
    MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
    return get_tokenizer(tokenizer_name=MODEL)
@pytest.fixture(scope="module")
def mistral_tokenizer():
    """Module-scoped v11+ tokenizer loaded in native "mistral" mode."""
    MODEL = "mistralai/Mistral-Small-3.2-24B-Instruct-2506"
    return get_tokenizer(tokenizer_name=MODEL, tokenizer_mode="mistral")
@pytest.fixture
def mistral_pre_v11_tool_parser(mistral_pre_v11_tokenizer):
    """Fresh MistralToolParser over the pre-v11 tokenizer for each test."""
    return MistralToolParser(mistral_pre_v11_tokenizer)
@pytest.fixture
def mistral_tool_parser(mistral_tokenizer):
    """Fresh MistralToolParser over the v11+ tokenizer for each test."""
    return MistralToolParser(mistral_tokenizer)
def assert_tool_calls(
    actual_tool_calls: list[ToolCall] | list[DeltaToolCall],
    expected_tool_calls: list[ToolCall],
):
    """Assert parsed tool calls match the expected ones, pairwise in order."""
    assert len(actual_tool_calls) == len(expected_tool_calls)

    for actual, expected in zip(actual_tool_calls, expected_tool_calls):
        # Generated call ids are random 9-character strings.
        assert isinstance(actual.id, str)
        assert len(actual.id) == 9

        if isinstance(actual, ToolCall):
            assert actual.type == "function"
        elif isinstance(actual, DeltaToolCall):
            # Streaming deltas must have fully-populated function fields.
            assert actual.function is not None
            assert actual.function.name is not None
            assert actual.function.arguments is not None

        assert actual.function is not None
        assert actual.function.name == expected.function.name, (
            f"got wrong function name:${actual.function.name}"
        )
        assert actual.function.arguments == expected.function.arguments, (
            f"got wrong function argument:${actual.function.arguments}"
        )
def fix_tool_call_tokenization(
    tokens: list[int],
    mistral_tool_parser: MistralToolParser,
    mistral_tokenizer: TokenizerLike,
):
    """
    Replaces the textual token sequence for [TOOL_CALLS]
    with its single special token ID.
    """
    # Token ids that spell out the textual [TOOL_CALLS] marker.
    # These must not contain special tokens like bos, eos etc.
    needle = mistral_tokenizer.encode(
        text=mistral_tool_parser.bot_token,
        add_special_tokens=False,
    )
    replacement = [mistral_tool_parser.bot_token_id]

    # If the input is too short to contain the sequence, no replacement is possible
    if not tokens or len(tokens) < len(needle):
        return tokens

    fixed: list[int] = []
    needle_len = len(needle)
    pos = 0
    while pos < len(tokens):
        if tokens[pos : pos + needle_len] == needle:
            # Found the textual marker: emit the special id and skip past it.
            fixed.extend(replacement)
            pos += needle_len
        else:
            # No match here; keep the token and advance by one.
            fixed.append(tokens[pos])
            pos += 1
    return fixed
def stream_delta_message_generator(
    mistral_tool_parser: MistralToolParser,
    mistral_tokenizer: TokenizerLike,
    model_output: str | None,
    tools: list[tuple[str, str]] | None,
) -> Generator[DeltaMessage, None, None]:
    """Feed tokens one at a time through the streaming tool parser.

    Tokenizes either ``model_output`` (pre-v11 tokenizers) or the given
    ``tools`` list (v11+ tokenizers), then replays the token stream through
    ``extract_tool_calls_streaming``, yielding every non-empty DeltaMessage.
    """
    if (
        isinstance(mistral_tokenizer, MistralTokenizer)
        and mistral_tokenizer.version >= 11
    ):
        # With the newer versions of the tokenizer,
        # we cannot tokenize free text
        # so we need to create a list of messages to get tokenized
        assert tools is not None
        assistant_msg = AssistantMessage(
            tool_calls=[
                ToolCall(
                    function=FunctionCall(
                        name=name,
                        arguments=arg,
                    )
                )
                for (name, arg) in tools
            ],
        )
        request = InstructRequest(
            messages=[assistant_msg],
        )
        all_token_ids = mistral_tokenizer.instruct.encode_instruct(request).tokens
    else:
        # Older versions of the tokenizer are
        # able to encode directly the model's output (free text) into tokens
        assert model_output is not None
        all_token_ids = mistral_tokenizer.encode(model_output, add_special_tokens=False)
        # Collapse the textual [TOOL_CALLS] spelling into its special token id
        # so the parser sees the marker the way the model would emit it.
        all_token_ids = fix_tool_call_tokenization(
            all_token_ids, mistral_tool_parser, mistral_tokenizer
        )

    # Incremental-detokenization state carried across iterations.
    previous_text = ""
    previous_tokens = None
    prefix_offset = 0
    read_offset = 0
    for i, delta_token in enumerate(all_token_ids):
        # Simulate streaming: exactly one new token per step.
        delta_token_ids = [delta_token]
        previous_token_ids = all_token_ids[:i]
        current_token_ids = all_token_ids[: i + 1]

        (new_tokens, delta_text, new_prefix_offset, new_read_offset) = (
            detokenize_incrementally(
                tokenizer=mistral_tokenizer,
                all_input_ids=current_token_ids,
                prev_tokens=previous_tokens,
                prefix_offset=prefix_offset,
                read_offset=read_offset,
                # MistralTokenizer renders special tokens itself, so skip them.
                skip_special_tokens=isinstance(mistral_tokenizer, MistralTokenizer),
                spaces_between_special_tokens=True,
            )
        )

        current_text = previous_text + delta_text

        delta_message = mistral_tool_parser.extract_tool_calls_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
            request=None,  # type: ignore[arg-type]
        )
        # Only yield steps where the parser actually produced a delta.
        if delta_message:
            yield delta_message

        # Advance the incremental-detokenization state for the next token.
        previous_text = current_text
        previous_tokens = (
            previous_tokens + new_tokens if previous_tokens else new_tokens
        )
        prefix_offset = new_prefix_offset
        read_offset = new_read_offset
def test_extract_tool_calls_no_tools(mistral_pre_v11_tool_parser):
    """Plain text without a [TOOL_CALLS] marker must pass through untouched:
    no tool calls are reported and the full text is returned as content."""
    text = "This is a test"
    result = mistral_pre_v11_tool_parser.extract_tool_calls(
        text,
        request=None,  # type: ignore[arg-type]
    )
    assert not result.tools_called
    assert result.tool_calls == []
    assert result.content == text
@pytest.mark.parametrize(
    ids=[
        "single_tool_add",
        "single_tool_weather",
        "argument_before_name",
        "argument_before_name_and_name_in_argument",
    ],
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        (
            """[TOOL_CALLS][{"name": "add", "arguments":{"a": 3.5, "b": 4}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
                    )
                )
            ],
            None,
        ),
        (
            """[TOOL_CALLS] [{"name": "get_current_weather", "arguments":{"city": "San Francisco", "state": "CA", "unit": "celsius"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            None,
        ),
        # "arguments" appearing before "name" in the JSON object must still parse.
        (
            """[TOOL_CALLS] [{"arguments":{"city": "San Francisco", "state": "CA", "unit": "celsius"}, "name": "get_current_weather"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            None,
        ),
        # A "name" key inside the arguments must not be confused with the tool name.
        (
            """[TOOL_CALLS] [{"arguments":{"name": "John Doe"}, "name": "get_age"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_age",
                        arguments=json.dumps(
                            {
                                "name": "John Doe",
                            }
                        ),
                    )
                )
            ],
            None,
        ),
    ],
)
def test_extract_tool_calls_pre_v11_tokenizer(
    mistral_pre_v11_tool_parser, model_output, expected_tool_calls, expected_content
):
    """Non-streaming extraction for the pre-v11 tokenizer format, where tool
    calls are emitted as a JSON list after a single [TOOL_CALLS] marker."""
    extracted_tool_calls = mistral_pre_v11_tool_parser.extract_tool_calls(
        model_output, request=None
    )  # type: ignore[arg-type]
    assert extracted_tool_calls.tools_called
    assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
    assert extracted_tool_calls.content == expected_content
@pytest.mark.parametrize(
    ids=[
        "single_tool_add",
        "single_tool_weather",
        "multiple_tool_calls",
    ],
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        (
            """[TOOL_CALLS]add_this_and_that{"a": 3.5, "b": 4}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add_this_and_that",
                        arguments=json.dumps({"a": 3.5, "b": 4}),
                    )
                )
            ],
            None,
        ),
        (
            """[TOOL_CALLS]get_current_weather{"city": "San Francisco", "state": "CA", "unit": "celsius"}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            None,
        ),
        # Two back-to-back [TOOL_CALLS] markers yield two parallel tool calls.
        (
            """[TOOL_CALLS]add{"a": 3.5, "b": 4}[TOOL_CALLS]multiply{"a": 3, "b": 6}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
                    )
                ),
                ToolCall(
                    function=FunctionCall(
                        name="multiply", arguments=json.dumps({"a": 3, "b": 6})
                    )
                ),
            ],
            None,
        ),
    ],
)
def test_extract_tool_calls(
    mistral_tool_parser, model_output, expected_tool_calls, expected_content
):
    """Non-streaming extraction for the v11+ tokenizer format, where each tool
    call is `[TOOL_CALLS]<name><json-args>` (no surrounding JSON list)."""
    extracted_tool_calls = mistral_tool_parser.extract_tool_calls(
        model_output, request=None
    )  # type: ignore[arg-type]
    assert extracted_tool_calls.tools_called
    assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
    assert extracted_tool_calls.content == expected_content
def _test_extract_tool_calls_streaming(
    tool_parser, tokenizer, model_output, tools, expected_tool_calls, expected_content
):
    """Drive the streaming parser delta-by-delta, reassemble the streamed tool
    calls and free-text content, and compare both against the expected values."""
    accumulated_content: str = ""
    streamed_names: list[str] = []
    args_buffers: list[str] = []
    active_index: int = -1
    call_ids: list[str | None] = []

    for delta in stream_delta_message_generator(
        tool_parser, tokenizer, model_output, tools
    ):
        # The tool parser must never stream a role.
        assert not delta.role

        if delta.content:
            accumulated_content += delta.content

        tool_call_deltas = delta.tool_calls
        if not tool_call_deltas:
            continue

        # Exactly one tool-call diff per delta, even for parallel calls.
        assert len(tool_call_deltas) == 1
        tc = tool_call_deltas[0]

        assert len(tool_parser.prev_tool_call_arr) > 0

        if tc.index != active_index:
            # A new tool call has started; allocate its buffers.
            active_index = tc.index
            args_buffers.append("")
            call_ids.append(None)

        # A call id may be streamed at most once per tool call.
        if tc.id and not call_ids[tc.index]:
            call_ids[tc.index] = tc.id

        if tc.function:
            if tc.function.name:
                # Function names must arrive whole, exactly one time.
                assert isinstance(tc.function.name, str)
                streamed_names.append(tc.function.name)
            if tc.function.arguments:
                # Argument fragments are strings appended in order.
                assert isinstance(tc.function.arguments, str)
                args_buffers[tc.index] += tc.function.arguments

    assert accumulated_content == expected_content

    reassembled_calls = [
        ToolCall(
            id=call_id,
            function=FunctionCall(
                name=name,
                arguments=partial_json_parser.ensure_json(
                    args_str, Allow.OBJ | Allow.STR
                ),
            ),
        )
        for call_id, name, args_str in zip(call_ids, streamed_names, args_buffers)
    ]
    assert_tool_calls(reassembled_calls, expected_tool_calls)
@pytest.mark.parametrize(
    ids=[
        "no_tools",
        "single_tool_add",
        "single_tool_add_strings",
        "single_tool_weather",
        "argument_before_name",
        "argument_before_name_and_name_in_argument",
        "multiple_tools",
    ],
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        ("""This is a test""", [], """This is a test"""),
        # Extra whitespace inside the JSON must be tolerated.
        (
            """[TOOL_CALLS] [ {"name":"add" , "arguments" : {"a": 3, "b": 4} } ]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3, "b": 4})
                    )
                )
            ],
            "",
        ),
        (
            """[TOOL_CALLS] [{"name": "add", "arguments":{"a": "3", "b": "4"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": "3", "b": "4"})
                    )
                )
            ],
            "",
        ),
        (
            """[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"city": "San Francisco", "state": "CA", "unit": "celsius"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            "",
        ),
        # "arguments" appearing before "name" must still parse while streaming.
        (
            """[TOOL_CALLS] [{"arguments": {"city": "San Francisco", "state": "CA", "unit": "celsius"}, "name": "get_current_weather"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            "",
        ),
        # A "name" key inside the arguments must not be confused with the tool name.
        (
            """[TOOL_CALLS] [{"arguments": {"name": "John Doe"}, "name": "get_age"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_age",
                        arguments=json.dumps(
                            {
                                "name": "John Doe",
                            }
                        ),
                    )
                )
            ],
            "",
        ),
        (
            """[TOOL_CALLS] [{"name": "add", "arguments": {"a": 3.5, "b": 4}}, {"name": "get_current_weather", "arguments":{"city": "San Francisco", "state": "CA", "unit": "celsius"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
                    )
                ),
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                ),
            ],
            "",
        ),
    ],
)
def test_extract_tool_calls_streaming_pre_v11_tokenizer(
    mistral_pre_v11_tool_parser,
    mistral_pre_v11_tokenizer,
    model_output,
    expected_tool_calls,
    expected_content,
):
    """Streaming extraction for the pre-v11 format; the free-text model output
    is tokenized directly, so no (name, args) tool list is needed."""
    _test_extract_tool_calls_streaming(
        mistral_pre_v11_tool_parser,
        mistral_pre_v11_tokenizer,
        model_output,
        None,
        expected_tool_calls,
        expected_content,
    )
@pytest.mark.parametrize(
    ids=[
        "single_tool_add",
        "single_tool_add_strings",
        "multiple_tools",
    ],
    # Newer tokenizers cannot encode free text, so each case supplies the tool
    # calls as (name, json-args) pairs instead of a raw model_output string.
    argnames=["tools", "expected_tool_calls", "expected_content"],
    argvalues=[
        (
            [("add", '{"a": 3, "b": 4}')],
            # [TOOL_CALLS]add{"a": 3, "b": 4}
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3, "b": 4})
                    )
                )
            ],
            "",
        ),
        (
            [("add_two_strings", '{"a": "3", "b": "4"}')],
            # [TOOL_CALLS]add_two_strings{"a": "3", "b": "4"}
            [
                ToolCall(
                    function=FunctionCall(
                        name="add_two_strings",
                        arguments=json.dumps({"a": "3", "b": "4"}),
                    )
                )
            ],
            "",
        ),
        (
            [
                ("add", '{"a": 3.5, "b": 4}'),
                (
                    "get_current_weather",
                    '{"city": "San Francisco", "state": "CA", "unit": "celsius"}',  # noqa: E501
                ),
            ],
            # [TOOL_CALLS]add{"a": 3.5, "b": 4}[TOOL_CALLS]get_current_weather{"city": "San Francisco", "state": "CA", "unit": "celsius"} # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
                    )
                ),
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                ),
            ],
            "",
        ),
    ],
)
def test_extract_tool_calls_streaming(
    mistral_tool_parser,
    mistral_tokenizer,
    tools,
    expected_tool_calls,
    expected_content,
):
    """Streaming extraction for the v11+ format; model_output is None because
    the tokens are produced from the structured tool list instead."""
    _test_extract_tool_calls_streaming(
        mistral_tool_parser,
        mistral_tokenizer,
        None,
        tools,
        expected_tool_calls,
        expected_content,
    )
@pytest.mark.parametrize(
    ids=[
        "single_tool_add",
        "single_tool_weather",
        "multiple_tool_calls",
        "content_before_tool",
    ],
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        (
            """[TOOL_CALLS]add_this_and_that{"a": 3.5, "b": 4}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add_this_and_that",
                        arguments=json.dumps({"a": 3.5, "b": 4}),
                    )
                )
            ],
            "",
        ),
        (
            """[TOOL_CALLS]get_current_weather{"city": "San Francisco", "state": "CA", "unit": "celsius"}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            "",
        ),
        (
            """[TOOL_CALLS]add{"a": 3.5, "b": 4}[TOOL_CALLS]multiply{"a": 3, "b": 6}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
                    )
                ),
                ToolCall(
                    function=FunctionCall(
                        name="multiply", arguments=json.dumps({"a": 3, "b": 6})
                    )
                ),
            ],
            "",
        ),
        (
            # Additional content should not be after the tool calls
            """bla[TOOL_CALLS]add_this_and_that{"a": 3.5, "b": 4}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add_this_and_that",
                        arguments=json.dumps({"a": 3.5, "b": 4}),
                    )
                )
            ],
            "bla",
        ),
    ],
)
def test_extract_tool_calls_streaming_one_chunk(
    mistral_tool_parser,
    mistral_tokenizer,
    model_output,
    expected_tool_calls,
    expected_content,
):
    """Streaming extraction where the whole model output arrives as a single
    delta: the parser must emit every tool call (and any leading content) in
    one DeltaMessage."""
    if isinstance(mistral_tokenizer, MistralTokenizer):
        all_token_ids = mistral_tokenizer.encode(model_output)
    else:
        # HF tokenizers may split the [TOOL_CALLS] marker into several tokens;
        # repair the sequence so it matches the special-token encoding.
        all_token_ids = mistral_tokenizer.encode(model_output, add_special_tokens=False)
        all_token_ids = fix_tool_call_tokenization(
            all_token_ids, mistral_tool_parser, mistral_tokenizer
        )

    # Feed the entire output as one delta (previous state is empty).
    delta_message = mistral_tool_parser.extract_tool_calls_streaming(
        previous_text="",
        current_text=model_output,
        delta_text=model_output,
        previous_token_ids=[],
        current_token_ids=all_token_ids,
        delta_token_ids=all_token_ids,
        request=None,
    )  # type: ignore[arg-type]

    assert isinstance(delta_message, DeltaMessage)
    assert len(delta_message.tool_calls) == len(expected_tool_calls)
    assert_tool_calls(delta_message.tool_calls, expected_tool_calls)
    # No content in the delta is equivalent to expecting the empty string.
    if delta_message.content is None:
        assert expected_content == ""
    else:
        assert delta_message.content == expected_content
@pytest.mark.parametrize(
    ids=[
        "no_tools",
        "single_tool_add",
        "single_tool_add_strings",
        "single_tool_weather",
        "argument_before_name",
        "argument_before_name_and_name_in_argument",
        "multiple_tools",
    ],
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        ("""This is a test""", [], """This is a test"""),
        # Extra whitespace inside the JSON must be tolerated.
        (
            """[TOOL_CALLS] [ {"name":"add" , "arguments" : {"a": 3, "b": 4} } ]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3, "b": 4})
                    )
                )
            ],
            "",
        ),
        (
            """[TOOL_CALLS] [{"name": "add", "arguments":{"a": "3", "b": "4"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": "3", "b": "4"})
                    )
                )
            ],
            "",
        ),
        (
            """[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"city": "San Francisco", "state": "CA", "unit": "celsius"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            "",
        ),
        # "arguments" appearing before "name" must still parse.
        (
            """[TOOL_CALLS] [{"arguments": {"city": "San Francisco", "state": "CA", "unit": "celsius"}, "name": "get_current_weather"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            "",
        ),
        # A "name" key inside the arguments must not be confused with the tool name.
        (
            """[TOOL_CALLS] [{"arguments": {"name": "John Doe"}, "name": "get_age"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_age",
                        arguments=json.dumps(
                            {
                                "name": "John Doe",
                            }
                        ),
                    )
                )
            ],
            "",
        ),
        (
            """[TOOL_CALLS] [{"arguments": {"a": 3.5, "b": 4}, "name": "add"}, {"arguments":{"city": "San Francisco", "state": "CA", "unit": "celsius"}, "name": "get_current_weather"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
                    )
                ),
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                ),
            ],
            "",
        ),
    ],
)
def test_extract_tool_calls_streaming_pre_v11_tokenizer_one_chunk(
    mistral_pre_v11_tool_parser,
    mistral_pre_v11_tokenizer,
    model_output,
    expected_tool_calls,
    expected_content,
):
    """Streaming extraction for the pre-v11 format where the whole model output
    arrives as a single delta; every tool call must be emitted in one
    DeltaMessage."""
    if isinstance(mistral_pre_v11_tokenizer, MistralTokenizer):
        all_token_ids = mistral_pre_v11_tokenizer.encode(model_output)
    else:
        # HF tokenizers may split the [TOOL_CALLS] marker into several tokens;
        # repair the sequence so it matches the special-token encoding.
        all_token_ids = mistral_pre_v11_tokenizer.encode(
            model_output, add_special_tokens=False
        )
        all_token_ids = fix_tool_call_tokenization(
            all_token_ids, mistral_pre_v11_tool_parser, mistral_pre_v11_tokenizer
        )

    # Feed the entire output as one delta (previous state is empty).
    delta_message = mistral_pre_v11_tool_parser.extract_tool_calls_streaming(
        previous_text="",
        current_text=model_output,
        delta_text=model_output,
        previous_token_ids=[],
        current_token_ids=all_token_ids,
        delta_token_ids=all_token_ids,
        request=None,
    )  # type: ignore[arg-type]

    assert isinstance(delta_message, DeltaMessage)
    assert len(delta_message.tool_calls) == len(expected_tool_calls)
    assert_tool_calls(delta_message.tool_calls, expected_tool_calls)
    # No content in the delta is equivalent to expecting the empty string.
    if delta_message.content is None:
        assert expected_content == ""
    else:
        assert delta_message.content == expected_content

View File

@ -123,7 +123,7 @@ CONFIGS: dict[str, ServerConfig] = {
"supports_parallel": True,
"extended": True,
},
"mistral": {
"mistral-7b": {
"model": "mistralai/Mistral-7B-Instruct-v0.3",
"arguments": [
"--enforce-eager",
@ -145,6 +145,32 @@ CONFIGS: dict[str, ServerConfig] = {
"call the tool. Otherwise, answer the user's query directly "
"without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
"to the user's question - just respond to it normally.",
"supports_parallel": True,
},
"mistral-small-3.2": {
"model": "mistralai/Mistral-Small-3.2-24B-Instruct-2506",
"arguments": [
"--enforce-eager",
"--no-enable-prefix-caching",
"--tool-call-parser",
"mistral",
"--tokenizer-mode",
"mistral",
"--config-format",
"mistral",
"--load-format",
"mistral",
"--tensor-parallel-size",
"4",
'--ignore-patterns="consolidated.safetensors"',
],
"system_prompt": "You are a helpful assistant with access to tools. If a tool"
" that you have would be helpful to answer a user query, "
"call the tool. Otherwise, answer the user's query directly "
"without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
"to the user's question - just respond to it normally.",
"supports_parallel": True,
"extended": True,
},
# FIXME: This test currently fails, need to debug why.
# "granite20b": {

View File

@ -11,7 +11,9 @@ PROMPTS = [
]
def test_reset_prefix_cache_e2e():
def test_reset_prefix_cache_e2e(monkeypatch):
# "spawn" is required for test to be deterministic
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
engine_args = EngineArgs(
model="Qwen/Qwen3-0.6B",
gpu_memory_utilization=0.2,

View File

@ -9,6 +9,7 @@ from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.usage.usage_lib import UsageContext
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.hashing import _xxhash
def test_prefix_caching_from_cli():
@ -48,6 +49,21 @@ def test_prefix_caching_from_cli():
args = parser.parse_args(["--prefix-caching-hash-algo", "invalid"])
@pytest.mark.skipif(_xxhash is None, reason="xxhash not installed")
def test_prefix_caching_xxhash_from_cli():
    """Both xxhash-based prefix-caching hash algorithms must round-trip from
    the CLI flag into the engine's cache config."""
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    for algo in ("xxhash", "xxhash_cbor"):
        args = parser.parse_args(["--prefix-caching-hash-algo", algo])
        vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
        assert vllm_config.cache_config.prefix_caching_hash_algo == algo
def test_defaults_with_usage_context():
engine_args = EngineArgs(model="facebook/opt-125m")
vllm_config: VllmConfig = engine_args.create_engine_config(UsageContext.LLM_CLASS)

View File

@ -1,8 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io
import json
import openai # use the official client for correctness check
@ -13,6 +11,7 @@ from transformers import AutoConfig
from tests.conftest import ImageTestAssets
from tests.utils import RemoteOpenAIServer
from vllm.utils.serial_utils import tensor2base64
# any model with a chat template should work here
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
@ -50,18 +49,6 @@ async def client_with_image_embeds(server_with_image_embeds):
yield async_client
def encode_image_embedding_to_base64(image_embedding) -> str:
"""
Encode image embedding to base64 string
"""
buffer = io.BytesIO()
torch.save(image_embedding, buffer)
buffer.seek(0)
binary_data = buffer.read()
base64_image_embedding = base64.b64encode(binary_data).decode("utf-8")
return base64_image_embedding
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("dtype", [torch.half, torch.float16, torch.float32])
@ -73,7 +60,7 @@ async def test_completions_with_image_embeds(
):
# Test case: Single image embeds input
image_embeds = image_assets[0].image_embeds.to(dtype=dtype)
base64_image_embedding = encode_image_embedding_to_base64(image_embeds)
base64_image_embedding = tensor2base64(image_embeds)
chat_completion = await client_with_image_embeds.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},

View File

@ -3,12 +3,14 @@
from dataclasses import asdict
from typing import NamedTuple
import pytest
from PIL import Image
from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.config import KVTransferConfig
from vllm.multimodal.utils import encode_image_base64
from vllm.platforms import current_platform
MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w8a8"
@ -108,6 +110,13 @@ def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]):
print("-" * 50)
@pytest.mark.skipif(
current_platform.is_rocm(),
reason=(
"hipErrorLaunchFailure when running this test, see issue:"
"https://github.com/ROCm/pytorch/issues/2822"
),
)
def test_shared_storage_connector_hashes(tmp_path):
"""
Tests that SharedStorageConnector saves KV to the storage locations

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import sys
from typing import Any
import pytest
@ -10,7 +9,6 @@ from tests.utils import create_new_process_for_each_test
from tests.v1.logits_processors.utils import (
DUMMY_LOGITPROC_ARG,
DUMMY_LOGITPROC_FQCN,
DUMMY_LOGITPROC_MODULE,
MAX_TOKENS,
MODEL_NAME,
POOLING_MODEL_NAME,
@ -18,7 +16,6 @@ from tests.v1.logits_processors.utils import (
CustomLogitprocSource,
DummyLogitsProcessor,
WrappedPerReqLogitsProcessor,
dummy_module,
prompts,
)
from tests.v1.logits_processors.utils import entry_points as fake_entry_points
@ -162,8 +159,6 @@ def test_custom_logitsprocs(monkeypatch, logitproc_source: CustomLogitprocSource
kwargs: dict[str, list[str | type[LogitsProcessor]]] = {}
if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN:
# Scenario: load logitproc based on fully-qualified class name (FQCN)
# Inject dummy module which defines logitproc
sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module
kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN]
elif logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_CLASS:
# Scenario: load logitproc from provided class object

View File

@ -14,11 +14,9 @@ from tests.utils import RemoteOpenAIServerCustom, create_new_process_for_each_te
from tests.v1.logits_processors.utils import (
DUMMY_LOGITPROC_ARG,
DUMMY_LOGITPROC_FQCN,
DUMMY_LOGITPROC_MODULE,
MAX_TOKENS,
MODEL_NAME,
TEMP_GREEDY,
dummy_module,
prompts,
)
from tests.v1.logits_processors.utils import entry_points as fake_entry_points
@ -47,20 +45,14 @@ def _server_with_logitproc_entrypoint(
main.main()
def _server_with_logitproc_module(
def _server_with_logitproc_fqcn(
env_dict: dict[str, str] | None,
model: str,
vllm_serve_args: list[str],
) -> None:
"""Start vLLM server, inject module with dummy logitproc"""
# Patch `modules` to inject dummy logitproc module
from vllm.entrypoints.cli import main
sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module
# fork is required for workers to see entrypoint patch
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "fork"
if env_dict is not None:
os.environ.update(env_dict)
@ -99,7 +91,7 @@ def server(default_server_args, request, monkeypatch):
if request.param:
# Launch server, append FQCN argument, inject dummy logitproc module
args = default_server_args + request.param
_server_fxn = _server_with_logitproc_module
_server_fxn = _server_with_logitproc_fqcn
else:
# Launch server, inject dummy logitproc entrypoint
args = default_server_args

View File

@ -27,7 +27,7 @@ DUMMY_LOGITPROC_ARG = "target_token"
TEMP_GREEDY = 0.0
MAX_TOKENS = 20
DUMMY_LOGITPROC_ENTRYPOINT = "dummy_logitproc"
DUMMY_LOGITPROC_MODULE = "DummyModule"
DUMMY_LOGITPROC_MODULE = "tests.v1.logits_processors.utils"
DUMMY_LOGITPROC_FQCN = f"{DUMMY_LOGITPROC_MODULE}:DummyLogitsProcessor"

View File

@ -5,6 +5,7 @@ import torch
from vllm.config import SpeculativeConfig
from vllm.model_executor.models.interfaces import supports_eagle3
from vllm.platforms import current_platform
@pytest.mark.parametrize(
@ -21,6 +22,10 @@ from vllm.model_executor.models.interfaces import supports_eagle3
pytest.param(
"nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized-w4a16",
id="qwen3-eagle3-speculator-w4a16-verifier",
marks=pytest.mark.skipif(
current_platform.is_rocm(),
reason="The tests are skipped on rocm platform.",
),
),
],
)

View File

@ -761,6 +761,10 @@ def test_init_kv_cache_with_kv_sharing_valid():
assert kv_cache_config_after_init.kv_cache_groups[0].layer_names[1] == layer_1
@pytest.mark.skipif(
current_platform.is_rocm(),
reason="Attention backend FLASHINFER is not supported on ROCm.",
)
def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
"""
The GPU model runner creates different views into the

View File

@ -283,6 +283,28 @@ def _rocm_aiter_grouped_topk_fake(
pass
# Cache whether aiter supports FP8 MLA parameters
_AITER_MLA_SUPPORTS_FP8: bool | None = None


def _check_aiter_mla_fp8_support() -> bool:
    """Check if aiter.mla.mla_decode_fwd supports q_scale and kv_scale parameters."""
    global _AITER_MLA_SUPPORTS_FP8

    # Return the memoized answer if the probe already ran.
    if _AITER_MLA_SUPPORTS_FP8 is not None:
        return _AITER_MLA_SUPPORTS_FP8

    try:
        import inspect

        from aiter.mla import mla_decode_fwd

        # Probe the signature once; any failure (missing aiter, signature
        # introspection error) is treated as "not supported".
        params = inspect.signature(mla_decode_fwd).parameters
        _AITER_MLA_SUPPORTS_FP8 = "q_scale" in params and "kv_scale" in params
    except Exception:
        _AITER_MLA_SUPPORTS_FP8 = False
    return _AITER_MLA_SUPPORTS_FP8
def _rocm_aiter_mla_decode_fwd_impl(
q: torch.Tensor,
kv_buffer: torch.Tensor,
@ -299,6 +321,16 @@ def _rocm_aiter_mla_decode_fwd_impl(
) -> None:
from aiter.mla import mla_decode_fwd
kwargs = {
"sm_scale": sm_scale,
"logit_cap": logit_cap,
}
# Only pass q_scale and kv_scale if the aiter library supports them
if _check_aiter_mla_fp8_support():
kwargs["q_scale"] = q_scale
kwargs["kv_scale"] = kv_scale
mla_decode_fwd(
q,
kv_buffer.view(-1, 1, 1, q.shape[-1]),
@ -308,10 +340,7 @@ def _rocm_aiter_mla_decode_fwd_impl(
kv_indices,
kv_last_page_lens,
max_seqlen_qo,
sm_scale=sm_scale,
logit_cap=logit_cap,
q_scale=q_scale,
kv_scale=kv_scale,
**kwargs,
)

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
from collections import Counter
from collections.abc import Callable
from contextlib import ExitStack
from typing import Any
@ -22,6 +23,99 @@ from vllm.utils.torch_utils import weak_ref_tensors
logger = init_logger(__name__)
@dataclasses.dataclass(frozen=True)
class CUDAGraphStat:
    """One observation of a model step's cudagraph dispatch.

    Frozen so instances are hashable and can be counted with Counter.
    """

    # Number of tokens in the batch before padding.
    num_unpadded_tokens: int
    # Number of tokens after padding to a capture size.
    num_padded_tokens: int
    # How many padding tokens were added.
    num_paddings: int
    # String name of the runtime cudagraph dispatch mode used.
    runtime_mode: str
class CUDAGraphLogging:
    """Aggregate and log cudagraph metrics.

    Collects CUDAGraphStat observations and renders them as a markdown-style
    table (one row per distinct stat, with its observed count) prefixed by the
    static cudagraph config settings.
    """

    COLUMN_HEADERS = [
        "Unpadded Tokens",
        "Padded Tokens",
        "Num Paddings",
        "Runtime Mode",
        "Count",
    ]

    def __init__(self, cg_mode: CUDAGraphMode, cg_capture_sizes: list[int] | None):
        self.reset()
        # Stringify the config once; the header is constant for the lifetime
        # of this logger.
        self.cg_mode = str(cg_mode)
        self.cg_capture_sizes = str(cg_capture_sizes or [])
        self.settings_header = (
            "**CUDAGraph Config Settings:**\n\n"
            f"- Mode: {self.cg_mode}\n"
            f"- Capture sizes: {self.cg_capture_sizes}\n\n"
            "**CUDAGraph Stats:**\n\n"
        )

    def reset(self):
        # Drop all accumulated observations.
        self.stats = []

    def observe(self, cudagraph_stat: CUDAGraphStat):
        # Record one dispatch observation; duplicates are aggregated at
        # render time via Counter.
        self.stats.append(cudagraph_stat)

    def generate_metric_table(self) -> str:
        """Render the accumulated stats as a settings header plus a
        column-aligned markdown table, most frequent rows first."""
        stats_counts = Counter(self.stats)

        # Convert stats to rows of strings, in descending order of observed frequencies
        rows = []
        for stat, count in sorted(
            stats_counts.items(), key=lambda item: item[1], reverse=True
        ):
            rows.append(
                [
                    str(stat.num_unpadded_tokens),
                    str(stat.num_padded_tokens),
                    str(stat.num_paddings),
                    stat.runtime_mode,
                    str(count),
                ]
            )

        # Calculate column widths (max of header and data)
        col_widths = []
        for i, header_text in enumerate(self.COLUMN_HEADERS):
            max_width = len(header_text)
            for row in rows:
                max_width = max(max_width, len(row[i]))
            col_widths.append(max_width)

        table_header_list = [
            h.ljust(w) for h, w in zip(self.COLUMN_HEADERS, col_widths)
        ]
        table_header = "| " + " | ".join(table_header_list) + " |\n"
        table_separator = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|\n"

        # Create data rows with proper alignment
        data_rows = []
        for row in rows:
            formatted_row = [
                str(val).ljust(width) for val, width in zip(row, col_widths)
            ]
            data_rows.append("| " + " | ".join(formatted_row) + " |")

        return (
            self.settings_header
            + table_header
            + table_separator
            + "\n".join(data_rows)
            + "\n"
        )

    def log(self, log_fn=logger.info):
        # Emit the table (if anything was observed) and start a fresh window.
        if not self.stats:
            return
        log_fn(self.generate_metric_table())
        self.reset()
@dataclasses.dataclass
class CUDAGraphEntry:
batch_descriptor: BatchDescriptor

View File

@ -104,7 +104,8 @@ class FixFunctionalizationPass(VllmInductorPass):
mutated_args = {1: "result"}
self.defunctionalize(graph, node, mutated_args)
elif (
at_target
hasattr(torch.ops.vllm, "flashinfer_trtllm_fused_allreduce_norm")
and at_target
== torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default
):
mutated_args = {

View File

@ -30,7 +30,7 @@ CacheDType = Literal[
"fp8_ds_mla",
]
MambaDType = Literal["auto", "float32"]
PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor"]
PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor", "xxhash", "xxhash_cbor"]
KVOffloadingBackend = Literal["native", "lmcache"]
@ -77,9 +77,21 @@ class CacheConfig:
"""Whether to enable prefix caching."""
prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
"""Set the hash algorithm for prefix caching:\n
- "sha256" uses Pickle for object serialization before hashing.\n
- "sha256" uses Pickle for object serialization before hashing. This is the
current default, as SHA256 is the most secure choice to avoid potential
hash collisions.\n
- "sha256_cbor" provides a reproducible, cross-language compatible hash. It
serializes objects using canonical CBOR and hashes them with SHA-256."""
serializes objects using canonical CBOR and hashes them with SHA-256.\n
- "xxhash" uses Pickle serialization with xxHash (128-bit) for faster,
non-cryptographic hashing. Requires the optional ``xxhash`` package.
IMPORTANT: Use of a hashing algorithm that is not considered
cryptographically secure theoretically increases the risk of hash collisions,
which can cause undefined behavior or even leak private information in
multi-tenant environments. Even if collisions are still very unlikely, it is
important to consider your security risk tolerance against the performance
benefits before turning this on.\n
- "xxhash_cbor" combines canonical CBOR serialization with xxHash for
reproducible hashing. Requires the optional ``xxhash`` package."""
cpu_offload_gb: float = Field(default=0, ge=0)
"""The space in GiB to offload to CPU, per GPU. Default is 0, which means
no offloading. Intuitively, this argument can be seen as a virtual way to

View File

@ -4,7 +4,7 @@
import enum
from collections import Counter
from collections.abc import Callable
from dataclasses import asdict, field
from dataclasses import field
from pathlib import Path
from typing import TYPE_CHECKING, Any, ClassVar, Literal
@ -13,7 +13,7 @@ from pydantic.dataclasses import dataclass
import vllm.envs as envs
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
from vllm.config.utils import config, handle_deprecated
from vllm.config.utils import config, get_hash_factors, handle_deprecated, hash_factors
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils.import_utils import resolve_obj_by_qualname
@ -196,7 +196,16 @@ class PassConfig:
Any new fields that affect compilation should be added to the hash.
Any future fields that don't affect compilation should be excluded.
"""
return InductorPass.hash_dict(asdict(self))
ignored_fields = [
"enable_fusion",
"enable_attn_fusion",
"enable_noop",
"enable_sequence_parallelism",
"enable_async_tp",
"enable_fi_allreduce_fusion",
]
return hash_factors(get_hash_factors(self, ignored_factors=ignored_fields))
@field_validator(
"fuse_norm_quant",
@ -267,14 +276,6 @@ class PassConfig:
"v0.13.0 or v1.0.0, whichever is sooner",
)
# Force old flags to None to ensure they are not used
self.enable_fusion = None
self.enable_attn_fusion = None
self.enable_noop = None
self.enable_sequence_parallelism = None
self.enable_async_tp = None
self.enable_fi_allreduce_fusion = None
if not self.eliminate_noops:
if self.fuse_norm_quant or self.fuse_act_quant:
logger.warning_once(

View File

@ -84,7 +84,7 @@ TaskOption = Literal[
"transcription",
"draft",
]
TokenizerMode = Literal["auto", "hf", "slow", "mistral"]
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
LogprobsMode = Literal[
"raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
@ -141,6 +141,7 @@ class ModelConfig:
- "hf" will use the fast tokenizer if available.\n
- "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- Other custom values can be supported via plugins."""
trust_remote_code: bool = False
"""Trust remote code (e.g., from HuggingFace) when downloading the model
@ -1779,20 +1780,22 @@ class ModelConfig:
return False
elif attn_type == "decoder":
pooling_type = self.pooler_config.pooling_type.lower()
if pooling_type in ["all", "mean", "step", "cls"]:
if pooling_type in ["mean", "step", "cls"]:
logger.debug(
"Pooling models with %s pooling does not "
"support chunked prefill.",
pooling_type,
)
return False
else:
# pooling_type == "last"
elif pooling_type in ["all", "last"]:
logger.debug(
"Pooling models with causal attn and last pooling support "
"chunked prefill."
"Pooling models with causal attn and %s pooling support "
"chunked prefill.",
pooling_type,
)
return True
else:
raise ValueError(f"{pooling_type=} not supported.")
# vllm currently does not have pooling models using hybrid,
# attention_free or encoder_decoder attn types.
return attn_type != "encoder_decoder"
@ -1816,20 +1819,22 @@ class ModelConfig:
return False
elif attn_type == "decoder":
pooling_type = self.pooler_config.pooling_type.lower()
if pooling_type in ["all", "mean", "step", "cls"]:
if pooling_type in ["mean", "step", "cls"]:
logger.debug(
"Pooling models with %s pooling does not "
"support prefix caching.",
pooling_type,
)
return False
else:
# pooling_type == "last"
elif pooling_type in ["all", "last"]:
logger.debug(
"Pooling models with causal attn and last pooling support "
"prefix caching."
"Pooling models with causal attn and %s pooling support "
"prefix caching.",
pooling_type,
)
return True
else:
raise ValueError(f"{pooling_type=} not supported.")
# vllm currently does not have pooling models using hybrid,
# attention_free or encoder_decoder attn types.
return False

View File

@ -55,6 +55,10 @@ class ObservabilityConfig:
kv_cache_metrics_sample: float = Field(default=0.01, gt=0, le=1)
"""Sampling rate for KV cache metrics (0.0, 1.0]. Default 0.01 = 1% of blocks."""
cudagraph_metrics: bool = False
"""Enable CUDA graph metrics (number of padded/unpadded tokens, runtime cudagraph
dispatch modes, and their observed frequencies at every logging interval)."""
@cached_property
def collect_model_forward_time(self) -> bool:
"""Whether to collect model forward time for the request."""

View File

@ -593,10 +593,14 @@ class ParallelConfig:
"max_parallel_loading_workers is currently "
"not supported and will be ignored."
)
if self.distributed_executor_backend not in ("mp", "uni") and self.nnodes > 1:
allowed_backends = ("mp", "uni", "external_launcher")
if (
self.distributed_executor_backend not in allowed_backends
and self.nnodes > 1
):
raise ValueError(
"nnodes > 1 can only be set when distributed executor "
"backend is mp or uni."
"backend is mp, uni or external_launcher."
)
@property

Some files were not shown because too many files have changed in this diff Show More