From 7151f92241db1bb6ef4eb0fcfed87256646d554e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 30 Jun 2025 21:01:48 -0700 Subject: [PATCH 001/167] [Misc] Fix spec decode example (#20296) Signed-off-by: Woosuk Kwon --- examples/offline_inference/spec_decode.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index 90d103e5cb05d..3f38aa9fcaa60 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -79,9 +79,7 @@ def main(): trust_remote_code=True, tensor_parallel_size=args.tp, enable_chunked_prefill=args.enable_chunked_prefill, - max_num_batched_tokens=args.max_num_batched_tokens, enforce_eager=args.enforce_eager, - max_num_seqs=args.max_num_seqs, gpu_memory_utilization=0.8, speculative_config=speculative_config, disable_log_stats=False, From 92ee7baaf9a5bf6c8132dde56e4056933c61f50f Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 30 Jun 2025 21:03:55 -0700 Subject: [PATCH 002/167] [Example] add one-click runnable example for P2P NCCL XpYd (#20246) Signed-off-by: KuntaiDu --- .../disagg_example_p2p_nccl_xpyd.sh | 245 ++++++++++++++++++ .../disagg_proxy_p2p_nccl_xpyd.py} | 0 2 files changed, 245 insertions(+) create mode 100644 examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh rename examples/online_serving/{disagg_xpyd/disagg_prefill_proxy_xpyd.py => disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py} (100%) diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh new file mode 100644 index 0000000000000..2966f386c93a3 --- /dev/null +++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# ============================================================================= +# vLLM Disaggregated Serving Script - P2P NCCL XpYd Architecture +# ============================================================================= +# This script demonstrates disaggregated prefill and decode serving using +# P2P NCCL communication. The architecture supports various XpYd configurations: +# +# - 1P3D: 1 Prefill server + 3 Decode servers (current default) +# - 3P1D: 3 Prefill servers + 1 Decode server +# - etc. +# +# Configuration can be customized via environment variables: +# MODEL: Model to serve +# PREFILL_GPUS: Comma-separated GPU IDs for prefill servers +# DECODE_GPUS: Comma-separated GPU IDs for decode servers +# PREFILL_PORTS: Comma-separated ports for prefill servers +# DECODE_PORTS: Comma-separated ports for decode servers +# PROXY_PORT: Proxy server port used to setup XpYd connection. +# TIMEOUT_SECONDS: Server startup timeout +# ============================================================================= + +# Configuration - can be overridden via environment variables +MODEL=${MODEL:-meta-llama/Llama-3.1-8B-Instruct} +TIMEOUT_SECONDS=${TIMEOUT_SECONDS:-1200} +PROXY_PORT=${PROXY_PORT:-30001} + +# Default 1P3D configuration (1 Prefill + 3 Decode) +PREFILL_GPUS=${PREFILL_GPUS:-0} +DECODE_GPUS=${DECODE_GPUS:-1,2,3} +PREFILL_PORTS=${PREFILL_PORTS:-20003} +DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009} + +echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change." 
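# For example, a 3P1D layout (3 prefill + 1 decode) could be requested by
# overriding the variables above before running this script:
#   PREFILL_GPUS=0,1,2 PREFILL_PORTS=20003,20005,20007 \
#   DECODE_GPUS=3 DECODE_PORTS=20009 bash disagg_example_p2p_nccl_xpyd.sh
# (GPU IDs and ports here are illustrative; adjust them to the local hardware.)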
+echo "" +echo "Architecture Configuration:" +echo " Model: $MODEL" +echo " Prefill GPUs: $PREFILL_GPUS, Ports: $PREFILL_PORTS" +echo " Decode GPUs: $DECODE_GPUS, Ports: $DECODE_PORTS" +echo " Proxy Port: $PROXY_PORT" +echo " Timeout: ${TIMEOUT_SECONDS}s" +echo "" + +PIDS=() + +# Switch to the directory of the current script +cd "$(dirname "${BASH_SOURCE[0]}")" + +check_required_files() { + local files=("disagg_proxy_p2p_nccl_xpyd.py") + for file in "${files[@]}"; do + if [[ ! -f "$file" ]]; then + echo "Required file $file not found in $(pwd)" + exit 1 + fi + done +} + +check_hf_token() { + if [ -z "$HF_TOKEN" ]; then + echo "HF_TOKEN is not set. Please set it to your Hugging Face token." + echo "Example: export HF_TOKEN=your_token_here" + exit 1 + fi + if [[ "$HF_TOKEN" != hf_* ]]; then + echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token." + exit 1 + fi + echo "HF_TOKEN is set and valid." +} + +check_num_gpus() { + # Check if the number of GPUs are >=2 via nvidia-smi + num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + if [ "$num_gpus" -lt 2 ]; then + echo "You need at least 2 GPUs to run disaggregated prefill." + exit 1 + else + echo "Found $num_gpus GPUs." + fi +} + +ensure_python_library_installed() { + echo "Checking if $1 is installed..." + if ! python3 -c "import $1" > /dev/null 2>&1; then + echo "$1 is not installed. Please install it via pip install $1." + exit 1 + else + echo "$1 is installed." + fi +} + +cleanup() { + echo "Stopping everything…" + trap - INT TERM # prevent re-entrancy + kill -- -$$ # negative PID == "this whole process-group" + wait # reap children so we don't leave zombies + exit 0 +} + +wait_for_server() { + local port=$1 + local timeout_seconds=$TIMEOUT_SECONDS + local start_time=$(date +%s) + + echo "Waiting for server on port $port..." + + while true; do + if curl -s "localhost:${port}/v1/completions" > /dev/null; then + echo "Server on port $port is ready." + return 0 + fi + + local now=$(date +%s) + if (( now - start_time >= timeout_seconds )); then + echo "Timeout waiting for server on port $port" + return 1 + fi + + sleep 1 + done +} + +main() { + check_required_files + check_hf_token + check_num_gpus + ensure_python_library_installed pandas + ensure_python_library_installed datasets + ensure_python_library_installed vllm + ensure_python_library_installed quart + + trap cleanup INT + trap cleanup USR1 + trap cleanup TERM + + echo "Launching disaggregated serving components..." + echo "Please check the log files for detailed output:" + echo " - prefill*.log: Prefill server logs" + echo " - decode*.log: Decode server logs" + echo " - proxy.log: Proxy server log" + + # ============================================================================= + # Launch Proxy Server + # ============================================================================= + echo "" + echo "Starting proxy server on port $PROXY_PORT..." + python3 disagg_proxy_p2p_nccl_xpyd.py & + PIDS+=($!) 
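    # The prefill and decode engines launched below reach this proxy through the
    # proxy_ip/proxy_port fields of their --kv-transfer-config, which is why it
    # is brought up before them.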
+ + # Parse GPU and port arrays + IFS=',' read -ra PREFILL_GPU_ARRAY <<< "$PREFILL_GPUS" + IFS=',' read -ra DECODE_GPU_ARRAY <<< "$DECODE_GPUS" + IFS=',' read -ra PREFILL_PORT_ARRAY <<< "$PREFILL_PORTS" + IFS=',' read -ra DECODE_PORT_ARRAY <<< "$DECODE_PORTS" + + # ============================================================================= + # Launch Prefill Servers (X Producers) + # ============================================================================= + echo "" + echo "Starting ${#PREFILL_GPU_ARRAY[@]} prefill server(s)..." + for i in "${!PREFILL_GPU_ARRAY[@]}"; do + local gpu_id=${PREFILL_GPU_ARRAY[$i]} + local port=${PREFILL_PORT_ARRAY[$i]} + local kv_port=$((21001 + i)) + + echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port" + CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \ + --enforce-eager \ + --host 0.0.0.0 \ + --port $port \ + --tensor-parallel-size 1 \ + --seed 1024 \ + --dtype float16 \ + --max-model-len 10000 \ + --max-num-batched-tokens 10000 \ + --max-num-seqs 256 \ + --trust-remote-code \ + --gpu-memory-utilization 0.9 \ + --disable-log-request \ + --kv-transfer-config \ + "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_size\":\"1e1\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > prefill$((i+1)).log 2>&1 & + PIDS+=($!) + done + + # ============================================================================= + # Launch Decode Servers (Y Decoders) + # ============================================================================= + echo "" + echo "Starting ${#DECODE_GPU_ARRAY[@]} decode server(s)..." + for i in "${!DECODE_GPU_ARRAY[@]}"; do + local gpu_id=${DECODE_GPU_ARRAY[$i]} + local port=${DECODE_PORT_ARRAY[$i]} + local kv_port=$((22001 + i)) + + echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port" + VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \ + --enforce-eager \ + --host 0.0.0.0 \ + --port $port \ + --tensor-parallel-size 1 \ + --seed 1024 \ + --dtype float16 \ + --max-model-len 10000 \ + --max-num-batched-tokens 10000 \ + --max-num-seqs 256 \ + --trust-remote-code \ + --gpu-memory-utilization 0.7 \ + --disable-log-request \ + --kv-transfer-config \ + "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_size\":\"8e9\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > decode$((i+1)).log 2>&1 & + PIDS+=($!) + done + + # ============================================================================= + # Wait for All Servers to Start + # ============================================================================= + echo "" + echo "Waiting for all servers to start..." + for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do + if ! wait_for_server $port; then + echo "Failed to start server on port $port" + cleanup + exit 1 + fi + done + + echo "" + echo "All servers are up. Starting benchmark..." 
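    # Port 10001 used by the benchmark below is assumed to be the client-facing
    # HTTP port opened by disagg_proxy_p2p_nccl_xpyd.py (distinct from
    # PROXY_PORT above, which the engines use to connect to the proxy).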
+ + # ============================================================================= + # Run Benchmark + # ============================================================================= + cd ../../../benchmarks/ + python3 benchmark_serving.py --port 10001 --seed $(date +%s) \ + --model $MODEL \ + --dataset-name random --random-input-len 7500 --random-output-len 200 \ + --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log + + echo "Benchmarking done. Cleaning up..." + + cleanup +} + +main \ No newline at end of file diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py similarity index 100% rename from examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py rename to examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py From a2f14dc8f9bb04bd782d1aa4d2e6364841d63d6c Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Mon, 30 Jun 2025 23:17:07 -0500 Subject: [PATCH 003/167] [CI][Intel Gaudi][vllm-Plugin]Add CI for hpu-plugin-v1-test (#20196) Signed-off-by: Chendi Xue --- .../scripts/hardware_ci/run-hpu-test.sh | 48 +++++++++++++++---- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-hpu-test.sh b/.buildkite/scripts/hardware_ci/run-hpu-test.sh index 5efac3ddf469f..ae5b35a9ac6bd 100644 --- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh @@ -2,10 +2,34 @@ # This script build the CPU docker image and run the offline inference inside the container. # It serves a sanity check for compilation and basic model usage. -set -ex +set -exuo pipefail # Try building the docker image -docker build -t hpu-test-env -f docker/Dockerfile.hpu . +cat <&2 +fi + +# The trap will handle the container removal and final exit. \ No newline at end of file From bd5038af076a2e299d4781c3885415639a1ed3a5 Mon Sep 17 00:00:00 2001 From: Ernest Wong Date: Mon, 30 Jun 2025 21:44:39 -0700 Subject: [PATCH 004/167] [Doc] add config and troubleshooting guide for NCCL & GPUDirect RDMA (#15897) Signed-off-by: Ernest Wong --- docs/serving/distributed_serving.md | 45 ++++++++++++++++++++++++++++- docs/usage/troubleshooting.md | 21 ++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/docs/serving/distributed_serving.md b/docs/serving/distributed_serving.md index 38dcb8c81caf7..6665955411ad5 100644 --- a/docs/serving/distributed_serving.md +++ b/docs/serving/distributed_serving.md @@ -100,7 +100,50 @@ vllm serve /path/to/the/model/in/the/container \ --tensor-parallel-size 16 ``` -To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. 
+To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like InfiniBand. To correctly set up the cluster to use InfiniBand, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the InfiniBand is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses InfiniBand with GPUDirect RDMA, which is efficient. + +### GPUDirect RDMA + +To enable GPUDirect RDMA with vLLM, specific configuration tweaks are needed. This setup ensures: + +- `IPC_LOCK` Security Context: Add the `IPC_LOCK` capability to the container’s security context to lock memory pages and prevent swapping to disk. +- Shared Memory with `/dev/shm`: Mount `/dev/shm` in the pod spec to provide shared memory for IPC. + +When using Docker, you can set up the container as follows: + +```bash +docker run --gpus all \ + --ipc=host \ + --shm-size=16G \ + -v /dev/shm:/dev/shm \ + vllm/vllm-openai +``` + +When using Kubernetes, you can set up the pod spec as follows: + +```yaml +... +spec: + containers: + - name: vllm + image: vllm/vllm-openai + securityContext: + capabilities: + add: ["IPC_LOCK"] + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + limits: + nvidia.com/gpu: 8 + requests: + nvidia.com/gpu: 8 + volumes: + - name: dshm + emptyDir: + medium: Memory +... +``` !!! warning After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script][troubleshooting-incorrect-hardware-driver] for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See for more information. diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index 82957d33b19e0..7f1f76ce3d2e3 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -273,6 +273,27 @@ But you are sure that the model is in the [list of supported models][supported-m If you see an error like `RuntimeError: Failed to infer device type`, it means that vLLM failed to infer the device type of the runtime environment. You can check [the code](gh-file:vllm/platforms/__init__.py) to see how vLLM infers the device type and why it is not working as expected. After [this PR](gh-pr:14195), you can also set the environment variable `VLLM_LOGGING_LEVEL=DEBUG` to see more detailed logs to help debug the issue. 
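For example, a debugging run could be launched like this (the model name is only a placeholder):

```bash
VLLM_LOGGING_LEVEL=DEBUG vllm serve facebook/opt-125m
```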
+## NCCL error: unhandled system error during `ncclCommInitRank` + +If your serving workload uses GPUDirect RDMA for distributed serving across multiple nodes and encounters an error during `ncclCommInitRank`, with no clear error message even with `NCCL_DEBUG=INFO` set, it might look like this: + +```text +Error executing method 'init_device'. This might cause deadlock in distributed execution. +Traceback (most recent call last): +... + File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/device_communicators/pynccl.py", line 99, in __init__ + self.comm: ncclComm_t = self.nccl.ncclCommInitRank( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/device_communicators/pynccl_wrapper.py", line 277, in ncclCommInitRank + self.NCCL_CHECK(self._funcs["ncclCommInitRank"](ctypes.byref(comm), + File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/device_communicators/pynccl_wrapper.py", line 256, in NCCL_CHECK + raise RuntimeError(f"NCCL error: {error_str}") + RuntimeError: NCCL error: unhandled system error (run with NCCL_DEBUG=INFO for details) +... +``` + +This indicates vLLM failed to initialize the NCCL communicator, possibly due to a missing `IPC_LOCK` linux capability or an unmounted `/dev/shm`. Refer to [Distributed Inference and Serving](../serving/distributed_serving.md#running-vllm-on-multiple-nodes) for guidance on properly configuring the environment for distributed serving. + ## Known Issues - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759). From 27949354faa06035645aa908cc73922500a80b17 Mon Sep 17 00:00:00 2001 From: Alex Kogan <82225080+sakogan@users.noreply.github.com> Date: Tue, 1 Jul 2025 01:44:38 -0400 Subject: [PATCH 005/167] [Feature] A calibration-free RTN-based quantization for accurate and accelerated INT4/INT8 inference (#18768) Signed-off-by: Alex Kogan Co-authored-by: Michael Goin --- tests/quantization/test_rtn.py | 28 ++ .../layers/quantization/__init__.py | 3 + .../model_executor/layers/quantization/rtn.py | 288 ++++++++++++++++++ 3 files changed, 319 insertions(+) create mode 100644 tests/quantization/test_rtn.py create mode 100644 vllm/model_executor/layers/quantization/rtn.py diff --git a/tests/quantization/test_rtn.py b/tests/quantization/test_rtn.py new file mode 100644 index 0000000000000..04c1f98a709e2 --- /dev/null +++ b/tests/quantization/test_rtn.py @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright © 2025, Oracle and/or its affiliates. 
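# A minimal offline-inference sketch of how RTN quantization would be enabled
# (illustrative only; the model and dtype below are placeholders):
#
#   from vllm import LLM
#   llm = LLM(model="microsoft/Phi-3-mini-4k-instruct",
#             dtype="bfloat16", quantization="rtn")
#   outputs = llm.generate(["The capital of France is"])
#
# Target precision and group size default to 8 bits / 128 and can be overridden
# via the RTN_NUM_BITS and RTN_GROUP_SIZE environment variables (see
# vllm/model_executor/layers/quantization/rtn.py).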
+"""Tests RTN quantization startup and generation, +doesn't test correctness +""" +import pytest + +from tests.quantization.utils import is_quant_method_supported + +MODELS = ["microsoft/Phi-3-mini-4k-instruct"] + + +@pytest.mark.skipif(not is_quant_method_supported("rtn"), + reason="RTN is not supported on this GPU type.") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [10]) +def test_model_rtn_startup( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + + with vllm_runner(model, dtype=dtype, quantization="rtn") as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 1cb23e7a18875..60217ee86ad1d 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -35,6 +35,7 @@ QuantizationMethods = Literal[ "moe_wna16", "torchao", "auto-round", + "rtn", ] QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods)) @@ -110,6 +111,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .neuron_quant import NeuronQuantConfig from .ptpc_fp8 import PTPCFp8Config from .qqq import QQQConfig + from .rtn import RTNConfig from .torchao import TorchAOConfig from .tpu_int8 import Int8TpuConfig @@ -142,6 +144,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: "moe_wna16": MoeWNA16Config, "torchao": TorchAOConfig, "auto-round": AutoRoundConfig, + "rtn": RTNConfig } # Update the `method_to_config` with customized quantization methods. method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG) diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py new file mode 100644 index 0000000000000..7e7fd6d51fd32 --- /dev/null +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -0,0 +1,288 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright © 2025, Oracle and/or its affiliates. + +import os +from typing import Any, Optional + +import torch +import torch.nn.functional as F +from torch.nn.parameter import Parameter + +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + set_weight_attrs) +from vllm.model_executor.layers.quantization import QuantizationMethods +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + +logger = init_logger(__name__) +"""By default, use 8 bit as target precision, but it can be +overridden by setting the RTN_NUM_BITS envvar +""" +NUM_BITS = os.getenv('RTN_NUM_BITS', "8") +"""By default, use group size of 128 parameters, but it can be +overridden by setting the RTN_GROUP_SIZE envvar +""" +GROUP_SIZE = os.getenv('RTN_GROUP_SIZE', "128") + + +class RTNConfig(QuantizationConfig): + """Config class for RTN. 
+ """ + + def __init__( + self, + weight_bits: int = int(NUM_BITS), + group_size: int = int(GROUP_SIZE), + ) -> None: + self.weight_bits = weight_bits + self.group_size = group_size + + if self.weight_bits != 4 and self.weight_bits != 8: + raise ValueError( + "Currently, only 4-bit or 8-bit weight quantization is " + f"supported for RTN, but got {self.weight_bits} bits.") + + def __repr__(self) -> str: + return (f"RTNConfig(weight_bits={self.weight_bits}, " + f"group_size={self.group_size})") + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "rtn" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.bfloat16, torch.half] + + @classmethod + def get_min_capability(cls) -> int: + return 80 + + @classmethod + def get_config_filenames(cls) -> list[str]: + return [] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "RTNConfig": + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"]) + return cls(weight_bits, group_size) + + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> Optional["RTNLinearMethod"]: + if isinstance(layer, LinearBase): + return RTNLinearMethod(self) + return None + + +class RTNTensor: + """A wrapper over Tensor that enables quantization on-the-fly by + overloading the copy_ method. + """ + + def __init__(self, data: torch.Tensor, scale: torch.Tensor, + quant_config: RTNConfig) -> None: + self.data = data + self.scale = scale + self.quant_config = quant_config + + def narrow(self, dim, start, length): + factor = 1 if self.quant_config.weight_bits == 8 else 2 + return RTNTensor( + self.data.narrow(dim, start // factor, length // factor), + self.scale.narrow(dim, start, length), self.quant_config) + + @property + def shape(self): + shape = self.data.shape + factor = 1 if self.quant_config.weight_bits == 8 else 2 + return torch.Size((shape[0] * factor, shape[1])) + + def copy_(self, loaded_weight: torch.Tensor) -> None: + qweight, weight_scale = rtn_quantize(loaded_weight.cuda(), + self.quant_config.weight_bits, + self.quant_config.group_size) + + self.data.copy_(qweight) + self.scale.data.copy_(weight_scale) + + +class RTNParameter(Parameter): + """A wrapper over Parameter that returns RTNTensor (a wrapper over Tensor) + when its data is accessed. We need this wrapper for the data loading phase + only, so we can intercept a weight copying function (torch.Tensor.copy_) + and apply quantization on-the-fly. + """ + + def __new__(cls, data: torch.Tensor, **kwargs): + return super().__new__(cls, data=data, requires_grad=False) + + def __init__(self, data: torch.Tensor, scale: torch.Tensor, + quant_config: RTNConfig) -> None: + self.scale = scale + self.quant_config = quant_config + + @property + def data(self): + return RTNTensor(super().data, self.scale, self.quant_config) + + +class RTNLinearMethod(LinearMethodBase): + """Linear method for RTN. + + Args: + quant_config: The RTN quantization config. 
+ """ + + def __init__(self, quant_config: RTNConfig): + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + output_size_per_partition = sum(output_partition_sizes) + num_groups_per_col = (input_size_per_partition // + self.quant_config.group_size + if self.quant_config.group_size != -1 else 1) + + scale = Parameter( + torch.empty(output_size_per_partition, + num_groups_per_col, + dtype=params_dtype), + requires_grad=False, + ) + factor = 1 if self.quant_config.weight_bits == 8 else 2 + + weight = RTNParameter(data=torch.empty(output_size_per_partition // + factor, + input_size_per_partition, + dtype=torch.int8), + scale=scale, + quant_config=self.quant_config) + + layer.register_parameter("weight", weight) + set_weight_attrs(weight, { + **extra_weight_attrs, + "input_dim": 1, + "output_dim": 0, + }) + + layer.register_parameter("scale", scale) + layer.output_size_per_partition = output_size_per_partition + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + """torch.compile does not know how to deal with a Parameter subclass + (aka RTNParameter). As we don't really need RTNParameters for the + forward pass, we replace them with equivalent instances of Parameters. + """ + old_weight = layer.weight + assert isinstance(old_weight, RTNParameter) + data = old_weight.data.data + + delattr(layer, "weight") + + new_weight = Parameter(data=data, requires_grad=False) + layer.register_parameter("weight", new_weight) + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + qweight = layer.weight + scale = layer.scale + + weight = rtn_dequantize(qweight, scale) + out = F.linear(x, weight) + del weight + if bias is not None: + out.add_(bias) + + return out + + +def rtn_quantize(tensor: torch.Tensor, num_bits: int, + group_size: int) -> tuple[torch.Tensor, torch.Tensor]: + """Quantize a tensor using per-group static scaling factor. + + Args: + tensor: The input tensor. + num_bits: Target precision for the result (supported values are + 8 or 4). + group_size: Quantization granularity. + If equal to -1, each row in the input tensor is treated + as one group. + """ + + q_range = 2**num_bits + num_groups = (tensor.shape[0] * tensor.shape[1] // + group_size if group_size != -1 else tensor.shape[0]) + """Calculate a scaling factor per input group. + """ + input_flat = tensor.reshape(num_groups, -1) + input_min = torch.min(input_flat, dim=1, keepdim=True)[0] + input_max = torch.max(input_flat, dim=1, keepdim=True)[0] + input_max_abs = torch.max(input_min.abs(), input_max.abs()) + scale = (input_max_abs * 2.0 / (q_range - 1)) + """Scale each input group, truncate and round to the nearest integer. + """ + scaled_input = input_flat / scale + scaled_input = scaled_input.clamp(-q_range // 2, q_range // 2 - 1) + scaled_input = scaled_input.round() + + scale = scale.reshape(tensor.shape[0], -1).contiguous() + inputs_q = scaled_input.reshape(tensor.shape).to(torch.int8) + inputs_q = inputs_q.contiguous() + + if num_bits == 4: + """Pack two 4-bit values into each byte. 
+ """ + inputs_q = (inputs_q[:, 1::2] << 4) | (inputs_q[:, ::2] & 0xf) + inputs_q = inputs_q.reshape(tensor.shape[0] // 2, tensor.shape[1]) + inputs_q = inputs_q.contiguous() + + return inputs_q, scale + + +def rtn_dequantize(tensor: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: + """Dequantize a tensor using per-group static scaling factors. + + Args: + tensor: The input tensor. + scale: The tensor with per-group scale factors. + """ + + num_groups = scale.size(0) * scale.size(1) + input_dim, output_dim = tensor.shape + + num_bits = 8 if input_dim == scale.size(0) else 4 + if num_bits == 4: + input_dim *= 2 + + data = torch.empty((input_dim, output_dim), + dtype=scale.dtype, + device=tensor.device) + + if num_bits == 8: + data.copy_(tensor) + else: + """Unpack two 4-bit values from each byte. + """ + tensor = tensor.reshape(input_dim, output_dim // 2) + for i in range(2): + data[:, i::2] = (tensor << 4 * (1 - i)) >> 4 + """Scale each input group with its scaling factor. + """ + scale = scale.reshape(num_groups, -1) + data = data.reshape(num_groups, -1) + data = torch.mul(data, scale) + + input_deq = data.reshape((input_dim, output_dim)).contiguous() + return input_deq From be250bbc67973766e546e0e3d8abb21e5caa2b1f Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 1 Jul 2025 15:02:09 +0900 Subject: [PATCH 006/167] [V1] Only print cudagraph tqdm on rank 0 with `is_global_first_rank` (#19516) Signed-off-by: mgoin --- vllm/distributed/parallel_state.py | 31 ++++++++++++++++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 11 +++++++---- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 50dbbf50e9fcf..c53601a22f215 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1315,6 +1315,37 @@ def in_the_same_node_as(pg: Union[ProcessGroup, StatelessProcessGroup], return [x == 1 for x in aggregated_data.tolist()] +def is_global_first_rank() -> bool: + """ + Check if the current process is the first rank globally across all + parallelism strategies (PP, TP, DP, EP, etc.). + + Unlike group-specific checks like `get_tensor_model_parallel_rank() == 0` + or `get_pp_group().is_first_rank`, this function checks the global rank + across all parallelism dimensions. + + Returns: + bool: True if this is the global first rank (rank 0), False otherwise. + Returns True if distributed is not initialized (single process). + """ + try: + # If world group is available, use it for the most accurate check + global _WORLD + if _WORLD is not None: + return _WORLD.is_first_rank + + # If torch distributed is not initialized, assume single process + if not torch.distributed.is_initialized(): + return True + + # Fallback to torch's global rank + return torch.distributed.get_rank() == 0 + + except Exception: + # If anything goes wrong, assume this is the first rank + return True + + def _node_count(pg: Union[ProcessGroup, StatelessProcessGroup]) -> int: """ Returns the total number of nodes in the process group. 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 29d39de212f88..5bdaf4b969e70 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -26,7 +26,7 @@ from vllm.distributed.kv_transfer import (get_kv_transfer_group, has_kv_transfer_group) from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 from vllm.distributed.parallel_state import ( - get_pp_group, get_tp_group, graph_capture, + get_pp_group, get_tp_group, graph_capture, is_global_first_rank, prepare_communication_buffer_for_model) from vllm.forward_context import (DPMetadata, get_forward_context, set_forward_context) @@ -2285,9 +2285,12 @@ class GPUModelRunner(LoRAModelRunnerMixin): # can reuse the memory pool allocated for the large shapes. with graph_capture(device=self.device): full_cg = self.full_cuda_graph - for num_tokens in tqdm(reversed(self.cudagraph_batch_sizes), - desc="Capturing CUDA graphs", - total=len(self.cudagraph_batch_sizes)): + # Only rank 0 should print progress bar during capture + compilation_cases = reversed(self.cudagraph_batch_sizes) + if is_global_first_rank(): + compilation_cases = tqdm(list(compilation_cases), + desc="Capturing CUDA graph shapes") + for num_tokens in compilation_cases: # We skip EPLB here since we don't want to record dummy metrics for _ in range( self.compilation_config.cudagraph_num_of_warmups): From 86debab54c046232014b108d530a8c25d857e9a3 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Tue, 1 Jul 2025 00:48:10 -0600 Subject: [PATCH 007/167] Fix `numel()` downcast in vllm/csrc/moe/moe_align_sum_kernels.cu +2 (#17082) Co-authored-by: mgoin --- csrc/moe/moe_align_sum_kernels.cu | 2 +- csrc/moe/topk_softmax_kernels.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index 9335e2333b0d9..462dbd1f8b380 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -239,7 +239,7 @@ void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size] torch::Tensor& output) // [num_tokens, hidden_size] { const int hidden_size = input.size(-1); - const int num_tokens = output.numel() / hidden_size; + const auto num_tokens = output.numel() / hidden_size; const int topk = input.size(1); dim3 grid(num_tokens); diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index dea5b1f21ec27..064b76c9cd427 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -492,7 +492,7 @@ void topk_softmax( torch::Tensor& gating_output) // [num_tokens, num_experts] { const int num_experts = gating_output.size(-1); - const int num_tokens = gating_output.numel() / num_experts; + const auto num_tokens = gating_output.numel() / num_experts; const int topk = topk_weights.size(-1); const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0); From 22e9d42040f3ecf83da181cfd84ab4cea000c4af Mon Sep 17 00:00:00 2001 From: Prashant Gupta Date: Tue, 1 Jul 2025 00:02:20 -0700 Subject: [PATCH 008/167] [Misc] add xgrammar for arm64 (#18359) Signed-off-by: Prashant Gupta --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 6cc304e5b1f6d..97a35e05d38ab 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -23,7 +23,7 @@ lm-format-enforcer >= 0.10.11, < 0.11 llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or 
platform_machine == "arm64" or platform_machine == "aarch64" outlines == 0.1.11 lark == 1.2.2 -xgrammar == 0.1.19; platform_machine == "x86_64" or platform_machine == "aarch64" +xgrammar == 0.1.19; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs From 9909726d2a30d834d97efd7bf1c4fc0e52fa48b5 Mon Sep 17 00:00:00 2001 From: czhu-cohere Date: Tue, 1 Jul 2025 00:12:20 -0700 Subject: [PATCH 009/167] Enable ZP Support for Machete (#20268) Signed-off-by: czhu-cohere --- benchmarks/kernels/benchmark_machete.py | 2 ++ tests/kernels/quantization/test_machete_mm.py | 2 +- .../kernels/mixed_precision/machete.py | 20 +++++++++++++++---- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 0f896f187ecb9..f73d0511e01fc 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -234,8 +234,10 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: fn = lambda: ops.gptq_marlin_gemm( a=bt.a, + c=None, b_q_weight=w_q, b_scales=w_s, + global_scale=None, b_zeros=w_zp, g_idx=g_idx, perm=sort_indices, diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py index 998171baaf2de..a4fb9874c4906 100644 --- a/tests/kernels/quantization/test_machete_mm.py +++ b/tests/kernels/quantization/test_machete_mm.py @@ -139,7 +139,7 @@ def maybe_convert_zeropoints(zps: Optional[torch.Tensor], s: torch.Tensor): def group_size_valid(shape: tuple[int, int, int], group_size: Optional[int]) -> bool: - return group_size is None or group_size == -1 or group_size % shape[2] == 0 + return group_size is None or group_size == -1 or shape[2] % group_size == 0 def machete_quantize_and_pack(atype: torch.dtype, diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py index c7c45861875af..a75f3ac8d5033 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py @@ -33,8 +33,6 @@ class MacheteLinearKernel(MPLinearKernel): return False, "Act reordering currently not supported by Machete, "\ "when the input features are partitioned across "\ "devices" - if c.zero_points: - return False, "Zero points currently not supported by Machete" if c.weight_type not in query_machete_supported_quant_types( c.zero_points): @@ -53,6 +51,7 @@ class MacheteLinearKernel(MPLinearKernel): # note assumes that # `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0} # `weight_scale` is: {input_dim = 0, output_dim = 1} + # `weight_zp` is: {input_dim = 0, output_dim = 1, packed_dim = 1} def process_weights_after_loading(self, layer: torch.nn.Module): c = self.config @@ -90,16 +89,29 @@ class MacheteLinearKernel(MPLinearKernel): x.data = x.data.contiguous() return x + def transform_w_zp(x): + assert isinstance(x, BasevLLMParameter) + permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=1) + x_unpacked = unpack_quantized_values_into_int32(x.data, + c.weight_type, + packed_dim=1) + w_s = getattr(layer, self.w_s_name).data + # pre-apply scales to zero-points + x.data = (-1.0 * w_s * (x_unpacked.to(w_s.dtype))).contiguous() + return x 
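        # (After this transform, w_zp holds -(scale * zero_point) per group;
        # apply_weights below hands it to ops.machete_mm as b_group_zeros.)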
+ # Repack weights and scales for Machete self._transform_param(layer, self.w_q_name, transform_w_q) self._transform_param(layer, self.w_s_name, transform_w_s) + if c.zero_points: + self._transform_param(layer, self.w_zp_name, transform_w_zp) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: c = self.config - w_q, w_s, _, _ = self._get_weight_params(layer) + w_q, w_s, w_zp, _ = self._get_weight_params(layer) x_2d = x.reshape(-1, x.shape[-1]) out_shape = x.shape[:-1] + (c.partition_weight_shape[1], ) @@ -110,7 +122,7 @@ class MacheteLinearKernel(MPLinearKernel): output = ops.machete_mm(a=x_2d, b_q=w_q, b_type=c.weight_type, - b_group_zeros=None, + b_group_zeros=w_zp, b_group_scales=w_s, b_group_size=c.group_size) From 6cc1e7d96dab6b9c344ec87dec6dc9ab07ad5d21 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Tue, 1 Jul 2025 15:25:03 +0800 Subject: [PATCH 010/167] [CPU] Update custom ops for the CPU backend (#20255) Signed-off-by: jiang1.li --- .../scripts/hardware_ci/run-cpu-test.sh | 3 +- cmake/cpu_extension.cmake | 20 + csrc/cpu/sgl-kernels/common.h | 238 +++ csrc/cpu/sgl-kernels/gemm.cpp | 464 ++++++ csrc/cpu/sgl-kernels/gemm.h | 266 ++++ csrc/cpu/sgl-kernels/gemm_fp8.cpp | 530 +++++++ csrc/cpu/sgl-kernels/gemm_int8.cpp | 440 ++++++ csrc/cpu/sgl-kernels/moe.cpp | 1330 +++++++++++++++++ csrc/cpu/sgl-kernels/moe_fp8.cpp | 502 +++++++ csrc/cpu/sgl-kernels/moe_int8.cpp | 769 ++++++++++ csrc/cpu/sgl-kernels/vec.h | 308 ++++ csrc/cpu/shm.cpp | 178 +-- csrc/cpu/torch_bindings.cpp | 43 + docs/getting_started/installation/cpu.md | 1 + .../models/language/generation/test_common.py | 3 +- vllm/_custom_ops.py | 49 + vllm/envs.py | 5 + .../layers/fused_moe/cpu_fused_moe.py | 214 +++ vllm/model_executor/layers/fused_moe/layer.py | 41 +- vllm/model_executor/layers/linear.py | 25 +- vllm/model_executor/layers/utils.py | 25 +- .../layers/vocab_parallel_embedding.py | 2 +- vllm/platforms/cpu.py | 2 + 23 files changed, 5357 insertions(+), 101 deletions(-) create mode 100644 csrc/cpu/sgl-kernels/common.h create mode 100644 csrc/cpu/sgl-kernels/gemm.cpp create mode 100644 csrc/cpu/sgl-kernels/gemm.h create mode 100644 csrc/cpu/sgl-kernels/gemm_fp8.cpp create mode 100644 csrc/cpu/sgl-kernels/gemm_int8.cpp create mode 100644 csrc/cpu/sgl-kernels/moe.cpp create mode 100644 csrc/cpu/sgl-kernels/moe_fp8.cpp create mode 100644 csrc/cpu/sgl-kernels/moe_int8.cpp create mode 100644 csrc/cpu/sgl-kernels/vec.h create mode 100644 vllm/model_executor/layers/fused_moe/cpu_fused_moe.py diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 8db8c3a05fb30..42506730e868c 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -51,6 +51,7 @@ function cpu_tests() { pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model pytest -v -s tests/models/language/generation -m cpu_model + VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model pytest -v -s tests/models/language/pooling -m cpu_model pytest -v -s tests/models/multimodal/generation \ --ignore=tests/models/multimodal/generation/test_mllama.py \ @@ -98,4 +99,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 40 mins. 
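# cpu_tests is exported with `export -f` below so that the `bash -c` subshell
# launched under `timeout` can still see the function definition.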
export -f cpu_tests -timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 5cd2c98f23438..264c970ef784a 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -96,12 +96,21 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16") + set(ENABLE_AVX512BF16 ON) else() + set(ENABLE_AVX512BF16 OFF) message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3") endif() else() + set(ENABLE_AVX512BF16 OFF) message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.") endif() + + find_isa(${CPUINFO} "avx512_vnni" AVX512VNNI_FOUND) + if (AVX512VNNI_FOUND) + list(APPEND CXX_COMPILE_FLAGS "-mavx512vnni") + set(ENABLE_AVX512VNNI ON) + endif() elseif (AVX2_FOUND) list(APPEND CXX_COMPILE_FLAGS "-mavx2") @@ -231,6 +240,17 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) "csrc/cpu/quant.cpp" "csrc/cpu/shm.cpp" ${VLLM_EXT_SRC}) + if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI) + set(VLLM_EXT_SRC + "csrc/cpu/sgl-kernels/gemm.cpp" + "csrc/cpu/sgl-kernels/gemm_int8.cpp" + "csrc/cpu/sgl-kernels/gemm_fp8.cpp" + "csrc/cpu/sgl-kernels/moe.cpp" + "csrc/cpu/sgl-kernels/moe_int8.cpp" + "csrc/cpu/sgl-kernels/moe_fp8.cpp" + ${VLLM_EXT_SRC}) + add_compile_definitions(-DCPU_CAPABILITY_AVX512) + endif() elseif(POWER10_FOUND) set(VLLM_EXT_SRC "csrc/cpu/quant.cpp" diff --git a/csrc/cpu/sgl-kernels/common.h b/csrc/cpu/sgl-kernels/common.h new file mode 100644 index 0000000000000..20261c1ef3e87 --- /dev/null +++ b/csrc/cpu/sgl-kernels/common.h @@ -0,0 +1,238 @@ +// Adapted from +// https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc/cpu + +#pragma once + +#include +#include +#include + +// clang-format off + +#if defined(_OPENMP) +#include +#endif + +namespace { + +// dispatch bool +#define AT_DISPATCH_BOOL(BOOL_V, BOOL_NAME, ...) \ + [&] { \ + if (BOOL_V) { \ + constexpr bool BOOL_NAME = true; \ + return __VA_ARGS__(); \ + } else { \ + constexpr bool BOOL_NAME = false; \ + return __VA_ARGS__(); \ + } \ + }() + +// dispatch: bfloat16, float16, int8_t, fp8_e4m3 +#define CPU_DISPATCH_PACKED_TYPES(TYPE, ...) 
\ + [&] { \ + switch (TYPE) { \ + case at::ScalarType::BFloat16 : { \ + using packed_t = at::BFloat16; \ + return __VA_ARGS__(); \ + } \ + case at::ScalarType::Half: { \ + using packed_t = at::Half; \ + return __VA_ARGS__(); \ + } \ + case at::ScalarType::Char : { \ + using packed_t = int8_t; \ + return __VA_ARGS__(); \ + } \ + case at::ScalarType::Float8_e4m3fn : { \ + using packed_t = at::Float8_e4m3fn; \ + return __VA_ARGS__(); \ + } \ + default: \ + TORCH_CHECK(false, "Unsupported floating data type.\n"); \ + } \ + }() + +#define UNUSED(x) (void)(x) + +#define CHECK_CPU(x) TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor") + +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_LAST_DIM_CONTIGUOUS(x) \ + TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimention") + +#define CHECK_INPUT(x) \ + CHECK_CPU(x); \ + CHECK_CONTIGUOUS(x) +#define CHECK_LAST_DIM_CONTIGUOUS_INPUT(x) \ + CHECK_CPU(x); \ + CHECK_LAST_DIM_CONTIGUOUS(x) + +#define CHECK_DIM(d, x) TORCH_CHECK(x.dim() == d, #x " must be a " #d "D tensor") + +#define CHECK_EQ(a, b) TORCH_CHECK((a) == (b), "CHECK_EQ(" #a ", " #b ") failed. ", a, " vs ", b) + +// parallel routines +constexpr int GRAIN_SIZE = 1024; + +template ::value, int>::type = 0> +inline T div_up(T x, T y) { return (x + y - 1) / y; } + +template +inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) { +#if 0 + // onednn partition pattern + T& n_my = n_end; + if (nth <= 1 || n == 0) { + n_start = 0; + n_my = n; + } else { + T n1 = div_up(n, nth); + T n2 = n1 - 1; + T T1 = n - n2 * nth; + n_my = ith < T1 ? n1 : n2; + n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2; + } + n_end += n_start; +#else + // pytorch aten partition pattern + T n_my = div_up(n, nth); + n_start = ith * n_my; + n_end = std::min(n_start + n_my, n); +#endif +} + +template +inline void parallel_for(int n, const func_t& f) { +#if defined(_OPENMP) +#pragma omp parallel +{ + int nth = omp_get_num_threads(); + int ith = omp_get_thread_num(); + int tbegin, tend; + balance211(n, nth, ith, tbegin, tend); + f(tbegin, tend); +} +#else + f(0, n); +#endif +} + +// for 1d parallel, use `actual_nth` +// for 2d parallel, use even nths, e.g. 
43->42 +int inline adjust_num_threads(int m) { + int actual_nth = at::get_num_threads(); + if (m == 1) { + return actual_nth; + } + return std::max(1, (actual_nth >> 1) * 2); +} + +template +inline void parallel_2d(int m, int n, const func_t& f) { + + // make sure we have even num_threads + int nth = adjust_num_threads(m); + + // [NOTE] thread blocking: + // + // 1) prefer square block per thread + // 2) use even number of CPU cores + // 3) use all `num_threads` cores + // + // we have: + // TM * TN = T + // BM / TM = BN / TN + // then: + // TM = ((BM / BN) * T) ^ 0.5 + // + float r = float(m) / n; + int nth_m = std::ceil(std::sqrt(r * nth)); + int nth_n = 1; + for (; nth_m > 0; --nth_m) { + nth_n = nth / nth_m; + if (nth_m * nth_n == nth) { + break; + } + } + +#if defined(_OPENMP) +#pragma omp parallel num_threads(nth) +{ + int ith = omp_get_thread_num(); + int ith_m = ith / nth_n; + int ith_n = ith % nth_n; + + int thread_block_m = div_up(m, nth_m); + int thread_block_n = div_up(n, nth_n); + + int begin_m = ith_m * thread_block_m; + int end_m = std::min(m, begin_m + thread_block_m); + int begin_n = ith_n * thread_block_n; + int end_n = std::min(n, begin_n + thread_block_n); + + f(begin_m, end_m, begin_n, end_n); +} +#else + f(0, m, 0, n); +#endif +} + +template +int get_cache_blocks(int BLOCK_SIZE, int K) { + // L2 2MB and ratio of 50% + const int L2_size = 2048 * 1024 >> 1; + return std::max(1, int(L2_size / (BLOCK_SIZE * K * sizeof(T)))); +} + +// data indexing for dimension collapse +template +inline T data_index_init(T offset) { + return offset; +} + +template +inline T data_index_init(T offset, T& x, const T& X, Args&&... args) { + offset = data_index_init(offset, std::forward(args)...); + x = offset % X; + return offset / X; +} + +inline bool data_index_step() { + return true; +} + +template +inline bool data_index_step(T& x, const T& X, Args&&... args) { + if (data_index_step(std::forward(args)...)) { + x = ((x + 1) == X) ? 0 : (x + 1); + return x == 0; + } + return false; +} + +// forced unroll for perf critical path + +#if __has_attribute(always_inline) +#define ALWAYS_INLINE __attribute__((__always_inline__)) inline +#else +#define ALWAYS_INLINE inline +#endif + +template +struct Unroll { + template + ALWAYS_INLINE void operator()(const Func& f, Args... args) const { + Unroll{}(f, args...); + f(std::integral_constant{}, args...); + } +}; + +template <> +struct Unroll<1> { + template + ALWAYS_INLINE void operator()(const Func& f, Args... 
args) const { + f(std::integral_constant{}, args...); + } +}; + +} // anonymous namespace diff --git a/csrc/cpu/sgl-kernels/gemm.cpp b/csrc/cpu/sgl-kernels/gemm.cpp new file mode 100644 index 0000000000000..c122d07185ddb --- /dev/null +++ b/csrc/cpu/sgl-kernels/gemm.cpp @@ -0,0 +1,464 @@ +// Adapted from +// https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc/cpu + +#include "common.h" +#include "vec.h" +#include "gemm.h" + +// clang-format off + +namespace { + +// packed layout: +// quants {N, K} int8_t +// comp {N} int32_t +template +inline void s8s8_compensation(int8_t* __restrict__ packed, int K) { +#if defined(CPU_CAPABILITY_AVX512) + constexpr int COLS = BLOCK_N / 16; + __m512i vcomp[COLS]; + + for (int col = 0; col < COLS; ++col) { + vcomp[col] = _mm512_setzero_si512(); + } + + const int64_t offset = BLOCK_N * K; + const __m512i off = _mm512_set1_epi8(static_cast(0x80)); + for (int k = 0; k < K / 4; ++k) { + for (int col = 0; col < COLS; ++col) { + __m512i vb = _mm512_loadu_si512((const __m512i *)(packed + k * BLOCK_N * 4 + col * 64)); + vcomp[col] = _mm512_dpbusd_epi32(vcomp[col], off, vb); + } + } + + for (int col = 0; col < COLS; ++col) { + _mm512_storeu_si512((__m512i *)(packed + offset + col * 64), vcomp[col]); + } +#else + TORCH_CHECK(false, "s8s8_compensation not implemented!"); +#endif +} + +// convert to vnni format +// from [N, K] to [K/2, N, 2] for bfloat16 and float16 +template +inline void pack_vnni(packed_t* __restrict__ packed, const packed_t* __restrict__ weight, int N, int K) { + const int VNNI_BLK = 2; + for (int n = 0; n < N; ++n) { + for (int k = 0; k < K / VNNI_BLK; ++k) { + for (int d = 0; d < VNNI_BLK; ++d) { + packed[k * N * VNNI_BLK + n * VNNI_BLK + d] = weight[n * K + k * VNNI_BLK + d]; + } + } + } +} + +template <> +inline void pack_vnni(int8_t* __restrict__ packed, const int8_t* __restrict__ weight, int N, int K) { + constexpr int BLOCK_N = block_size_n(); + TORCH_CHECK(N == BLOCK_N); + + const int VNNI_BLK = 4; + for (int n = 0; n < N; ++n) { + for (int k = 0; k < K / VNNI_BLK; ++k) { + for (int d = 0; d < VNNI_BLK; ++d) { + packed[k * N * VNNI_BLK + n * VNNI_BLK + d] = weight[n * K + k * VNNI_BLK + d]; + } + } + } + s8s8_compensation(packed, K); +} + +template +inline void copy_stub(scalar_t* __restrict__ out, const float* __restrict__ input, int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + fVec data0 = fVec::loadu(input + d); + fVec data1 = fVec::loadu(input + d + fVec::size()); + bVec out_vec = convert_from_float_ext(data0, data1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d]); + } +} + +template +inline void copy_add_stub(scalar_t* __restrict__ out, const float* __restrict__ input, const float* __restrict__ bias, int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + fVec data0 = fVec::loadu(input + d) + fVec::loadu(bias + d); + fVec data1 = fVec::loadu(input + d + fVec::size()) + fVec::loadu(bias + d + fVec::size()); + bVec out_vec = convert_from_float_ext(data0, data1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d] + bias[d]); + } +} + +template +struct tinygemm_kernel_nn { + static inline void apply( + const scalar_t* 
__restrict__ A, const scalar_t* __restrict__ B, scalar_t* __restrict__ C, + const float* __restrict__ bias, int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!"); + } +}; + +#if defined(CPU_CAPABILITY_AVX512) +template +struct tinygemm_kernel_nn { + static inline void apply( + const at::BFloat16* __restrict__ A, const at::BFloat16* __restrict__ B, at::BFloat16* __restrict__ C, + const float* __restrict__ bias, int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 16; + + // prefetch distance + constexpr int PREFETCH_SIZE_K = 0; + + __m512bh va; + __m512bh vb[COLS]; + __m512 vc[ROWS * COLS]; + + auto loadc = [&](auto i) { + constexpr int col = i % COLS; + if constexpr (has_bias) { + vc[i] = _mm512_loadu_ps(bias + col * 16); + } else { + vc[i] = _mm512_set1_ps(0.f); + } + }; + Unroll{}(loadc); + + const int64_t K2 = K >> 1; + const int64_t lda2 = lda >> 1; + const int64_t ldb2 = ldb; // ldb * 2 >> 1; + const float* a_ptr = reinterpret_cast(A); + const float* b_ptr = reinterpret_cast(B); + + auto compute = [&](auto i, int64_t k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + va = (__m512bh)(_mm512_set1_ps(a_ptr[row * lda2 + k])); + } + if constexpr (row == 0) { + vb[col] = (__m512bh)(_mm512_loadu_si512(b_ptr + k * ldb2 + col * 16)); + if constexpr (PREFETCH_SIZE_K > 0) { + _mm_prefetch(b_ptr + (k + PREFETCH_SIZE_K) * ldb2 + col * 16, _MM_HINT_T0); + } + } + vc[i] = _mm512_dpbf16_ps(vc[i], va, vb[col]); + }; + for (int64_t k = 0; k < K2; ++k) { + Unroll{}(compute, k); + } + + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + // for COLS = 2, 4 use 512bit store + // for COLS = 1, 3 use 256bit store + if constexpr (COLS % 2 == 0) { + if constexpr (col % 2 == 0) { + _mm512_storeu_si512( + reinterpret_cast<__m512i*>((C + row * ldc + col * 16)), + (__m512i)(_mm512_cvtne2ps_pbh(vc[row * COLS + col + 1], vc[row * COLS + col]))); + } + } else { + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(C + row * ldc + col * 16), + (__m256i)(_mm512_cvtneps_pbh(vc[i]))); + } + }; + Unroll{}(storec); + } +}; +#endif + +#define LAUNCH_TINYGEMM_KERNEL_NN(MB_SIZE, NB_SIZE) \ + tinygemm_kernel_nn::apply( \ + A + mb_start * lda, B + nb_start * 2, C + mb_start * ldc + nb_start, \ + has_bias ? 
bias + nb_start : nullptr, K, lda, ldb, ldc); + +template +struct brgemm { + static inline void apply( + const scalar_t* __restrict__ A, const scalar_t* __restrict__ B, scalar_t* __restrict__ C, + float* __restrict__ Ctmp, const float* __restrict__ bias, + int64_t M, int64_t N, int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + + constexpr int BLOCK_N = block_size_n(); + at::native::cpublas::brgemm( + M, N, K, lda, ldb, BLOCK_N, /* add_C */false, + A, B, Ctmp); + + // copy from Ctmp to C + for (int64_t m = 0; m < M; ++m) { + if constexpr (has_bias) { + copy_add_stub(C + m * ldc, Ctmp + m * BLOCK_N, bias, N); + } else { + copy_stub(C + m * ldc, Ctmp + m * BLOCK_N, N); + } + } + } +}; + +template +void tinygemm_kernel( + const scalar_t* __restrict__ A, + const scalar_t* __restrict__ B, + scalar_t* __restrict__ C, + float* __restrict__ Ctmp, + const float* __restrict__ bias, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc, + bool brg) { + + if (brg) { + brgemm::apply( + A, B, C, Ctmp, bias, + M, N, K, lda, ldb, ldc); + return; + } + + // pattern: 1-4-16 + constexpr int64_t BLOCK_M = 4; + constexpr int64_t BLOCK_N = 64; + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + for (int mb = 0; mb < MB; ++mb) { + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(BLOCK_M, M - mb_start); + for (int64_t nb = 0; nb < NB; ++nb) { + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(BLOCK_N, N - nb_start); + + switch(mb_size << 4 | nb_size >> 4) { + // mb_size = 1 + case 0x12: LAUNCH_TINYGEMM_KERNEL_NN(1, 32); break; + case 0x14: LAUNCH_TINYGEMM_KERNEL_NN(1, 64); break; + // mb_size = 2 + case 0x22: LAUNCH_TINYGEMM_KERNEL_NN(2, 32); break; + case 0x24: LAUNCH_TINYGEMM_KERNEL_NN(2, 64); break; + // mb_size = 3 + case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break; + case 0x34: LAUNCH_TINYGEMM_KERNEL_NN(3, 64); break; + // mb_size = 4 + case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break; + case 0x44: LAUNCH_TINYGEMM_KERNEL_NN(4, 64); break; + default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size"); + } + } + } +} + +template +void weight_packed_linear_kernel_impl( + scalar_t* __restrict__ out, + const scalar_t* __restrict__ mat1, + const scalar_t* __restrict__ mat2, + const float* __restrict__ bias, + int64_t M, + int64_t N, + int64_t K, + int64_t mat1_strideM, + int64_t out_strideM) { + + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + + // use avx512-bf16 when a) M is small; b) dtype is bfloat16, otherwise use amx + const bool use_brgemm = (M > 4) || (!std::is_same_v); + + // l2 cache block for n + int64_t cache_blocks_nb = get_cache_blocks(BLOCK_N, K); + + // parallel on [MB, NB] + AT_DISPATCH_BOOL(bias != nullptr, has_bias, [&] { + parallel_2d(MB, NB, [&](int64_t begin_mb, int64_t end_mb, int64_t begin_nb, int64_t end_nb) { + + // for brgemm, use float32 for accumulate + alignas(64) float Ctmp[BLOCK_M * BLOCK_N]; + + for (int64_t nbb = begin_nb; nbb < end_nb; nbb += cache_blocks_nb) { + for (int64_t mb = begin_mb; mb < end_mb; ++mb) { + for (int64_t nb = nbb; nb < std::min(nbb + cache_blocks_nb, end_nb); ++nb) { + + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(M - mb_start, BLOCK_M); + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(N - nb_start, BLOCK_N); + + tinygemm_kernel( + /* A */ mat1 + mb_start * mat1_strideM, + /* B */ mat2 + 
nb_start * K /* nb * BLOCK_N * K */, + /* C */ out + mb_start * out_strideM + nb_start, + /* Ctmp*/ Ctmp, + /* bias*/ bias + nb_start, + /* M */ mb_size, + /* N */ nb_size, + /* K */ K, + /* lda */ mat1_strideM, + /* ldb */ nb_size, + /* ldc */ out_strideM, + /* brg */ use_brgemm); + }}} + + if (use_brgemm) { + at::native::cpublas::brgemm_release(); + } + }); + }); +} + +} // anonymous namespace + +// tinygemm interface +template +void tinygemm_kernel(const scalar_t* __restrict__ A, const scalar_t* __restrict__ B, scalar_t* __restrict__ C, + float* __restrict__ Ctmp, int64_t M, int64_t N, int64_t K, int64_t lda, int64_t ldb, int64_t ldc, bool brg) { + tinygemm_kernel(A, B, C, Ctmp, nullptr, M, N, K, lda, ldb, ldc, brg); +} + +#define INSTANTIATE_TINYGEMM_TEMPLATE(TYPE) \ + template void tinygemm_kernel( \ + const TYPE* __restrict__ A, const TYPE* __restrict__ B, TYPE* __restrict__ C, \ + float* __restrict__ Ctmp, int64_t M, int64_t N, int64_t K, int64_t lda, \ + int64_t ldb, int64_t ldc, bool brg) + +INSTANTIATE_TINYGEMM_TEMPLATE(at::BFloat16); +INSTANTIATE_TINYGEMM_TEMPLATE(at::Half); + +at::Tensor convert_weight_packed(at::Tensor& weight) { + // for 3d moe weights + // weight : [E, OC, IC] + // w1 : [E, 2N, K] + // w2 : [E, K, N] + CHECK_INPUT(weight); + + const int64_t ndim = weight.ndimension(); + TORCH_CHECK(ndim == 2 || ndim == 3, "expect weight to be 2d or 3d, got ", ndim, "d tensor."); + const auto st = weight.scalar_type(); + const int64_t E = ndim == 3 ? weight.size(0) : 1; + const int64_t OC = ndim == 3 ? weight.size(1) : weight.size(0); + const int64_t IC = ndim == 3 ? weight.size(2) : weight.size(1); + + // we handle 2 TILE_N at a time. + TORCH_CHECK(OC % TILE_N == 0, "invalid weight out features ", OC); + TORCH_CHECK(IC % TILE_K == 0, "invalid weight input features ", IC); + + constexpr int64_t BLOCK_N = block_size_n(); + const int64_t NB = div_up(OC, BLOCK_N); + + // use phony sizes here [E, OC, IC], for each [E], [OC, IC] -> [IC / 2, OC, 2] + auto packed_weight = at::empty({}, weight.options()); + const int64_t stride = OC * IC; + + TORCH_CHECK(st == at::kBFloat16 || st == at::kHalf || st == at::kChar || st == at::kFloat8_e4m3fn, + "expect weight to be bfloat16, float16, int8 or fp8_e4m3."); + + CPU_DISPATCH_PACKED_TYPES(st, [&] { + // adjust most inner dimension size + const int packed_row_size = get_row_size(IC); + auto sizes = weight.sizes().vec(); + sizes[ndim - 1] = packed_row_size; + packed_weight.resize_(sizes); + + const packed_t* w_data = weight.data_ptr(); + packed_t* packed_data = packed_weight.data_ptr(); + + // parallel on {E, NB} + at::parallel_for(0, E * NB, 0, [&](int64_t begin, int64_t end) { + int64_t e{0}, nb{0}; + data_index_init(begin, e, E, nb, NB); + + for (int64_t i = begin; i < end; ++i) { + UNUSED(i); + + int64_t n = nb * BLOCK_N; + int64_t n_size = std::min(BLOCK_N, OC - n); + pack_vnni( + packed_data + e * OC * packed_row_size + n * packed_row_size, + w_data + e * stride + n * IC, + n_size, + IC); + + // move to the next index + data_index_step(e, E, nb, NB); + } + }); + }); + return packed_weight; +} + +// mat1 : [M, K] +// mat2 : [N, K] +// bias : [N] +// out : [M, N] +// +at::Tensor weight_packed_linear(at::Tensor& mat1, at::Tensor& mat2, + const std::optional& bias, bool is_vnni) { + RECORD_FUNCTION( + "sgl-kernel::weight_packed_linear", std::vector({mat1, mat2, bias})); + + auto packed_w = is_vnni ? 
mat2 : convert_weight_packed(mat2); + + CHECK_LAST_DIM_CONTIGUOUS_INPUT(mat1); + CHECK_INPUT(mat2); + + int64_t M = mat1.size(0); + int64_t N = mat2.size(0); + int64_t K = mat2.size(1); + CHECK_EQ(mat1.size(1), K); + CHECK_DIM(2, mat1); + CHECK_DIM(2, mat2); + + auto out = at::empty({M, N}, mat1.options()); + + // strides + int64_t mat1_strideM = mat1.stride(0); + int64_t out_strideM = out.stride(0); + + const bool has_bias = bias.has_value(); + const float* bias_data = nullptr; + if (has_bias) { + CHECK_EQ(bias.value().size(0), N); + bias_data = bias.value().data_ptr(); + } + + AT_DISPATCH_REDUCED_FLOATING_TYPES(mat1.scalar_type(), "weight_packed_linear_kernel_impl", [&] { + weight_packed_linear_kernel_impl( + out.data_ptr(), + mat1.data_ptr(), + packed_w.data_ptr(), + bias_data, + M, + N, + K, + mat1_strideM, + out_strideM); + }); + + return out; +} diff --git a/csrc/cpu/sgl-kernels/gemm.h b/csrc/cpu/sgl-kernels/gemm.h new file mode 100644 index 0000000000000..afae19721ae96 --- /dev/null +++ b/csrc/cpu/sgl-kernels/gemm.h @@ -0,0 +1,266 @@ +#pragma once + +#include + +// clang-format off + +// amx-bf16 +#define TILE_M 16 +#define TILE_N 16 +#define TILE_K 32 + +// block size for AMX gemm +constexpr int block_size_m() { return 2 * TILE_M; } +constexpr int block_size_n() { return 2 * TILE_N; } + +// define threshold using brgemm (intel AMX) +template inline bool can_use_brgemm(int M); +template <> inline bool can_use_brgemm(int M) { return M > 4; } +template <> inline bool can_use_brgemm(int M) { return true; } +// TODO: add u8s8 brgemm, this requires PyTorch 2.7 +template <> inline bool can_use_brgemm(int M) { return false; } +template <> inline bool can_use_brgemm(int M) { return M > 4; } +template <> inline bool can_use_brgemm(int M) { return M > 4; } + +// work around compiler internal error +#define BLOCK_K 128 // 4 * TILE_K + +// adjust leading dimension size for K +template +inline int64_t get_row_size(int64_t K) { + return K; +} + +template <> +inline int64_t get_row_size(int64_t K) { + return K + sizeof(int32_t); +} + +inline int64_t get_row_size(int64_t K, bool use_int8_w8a8) { + return use_int8_w8a8 ? 
K + sizeof(int32_t) : K; +} + +// pack weight to vnni format +at::Tensor convert_weight_packed(at::Tensor& weight); + +// moe implementations for int8 w8a8 +template +void fused_experts_int8_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic1, + scalar_t* __restrict__ ic2, + uint8_t* __restrict__ A_tmp, + float* __restrict__ C_tmp, + uint8_t* __restrict__ Aq_tmp, + float* __restrict__ As_tmp, + const scalar_t* __restrict__ input, + const int8_t* __restrict__ packed_w1, + const int8_t* __restrict__ packed_w2, + const float* __restrict__ w1s, + const float* __restrict__ w2s, + const float* __restrict__ topk_weights, + const int32_t* __restrict__ sorted_ids, + const int32_t* __restrict__ expert_ids, + const int32_t* __restrict__ offsets, + int64_t M, + int64_t N, + int64_t K, + int64_t E, + int64_t topk, + int64_t num_tokens_post_pad); + +// moe implementations for fp8 w8a16 +template +void fused_experts_fp8_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic0, + scalar_t* __restrict__ ic1, + scalar_t* __restrict__ ic2, + scalar_t* __restrict__ A_tmp, + scalar_t* __restrict__ B_tmp, + float* __restrict__ C_tmp, + const scalar_t* __restrict__ input, + const at::Float8_e4m3fn* __restrict__ packed_w1, + const at::Float8_e4m3fn* __restrict__ packed_w2, + const float* __restrict__ w1s, + const float* __restrict__ w2s, + int64_t block_size_N, + int64_t block_size_K, + const float* __restrict__ topk_weights, + const int32_t* __restrict__ sorted_ids, + const int32_t* __restrict__ expert_ids, + const int32_t* __restrict__ offsets, + int64_t M, + int64_t N, + int64_t K, + int64_t E, + int64_t topk, + int64_t num_tokens_post_pad); + +// moe implementations for int4 w4a16 +template +void fused_experts_int4_w4a16_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic0, + scalar_t* __restrict__ ic1, + scalar_t* __restrict__ ic2, + scalar_t* __restrict__ A_tmp, + scalar_t* __restrict__ B_tmp, + float* __restrict__ C_tmp, + const scalar_t* __restrict__ input, + const at::quint4x2* __restrict__ packed_w1, + const at::quint4x2* __restrict__ packed_w2, + const uint8_t* __restrict__ w1z, + const uint8_t* __restrict__ w2z, + const scalar_t* __restrict__ w1s, + const scalar_t* __restrict__ w2s, + int group_size, + const float* __restrict__ topk_weights, + const int32_t* __restrict__ sorted_ids, + const int32_t* __restrict__ expert_ids, + const int32_t* __restrict__ offsets, + int64_t M, + int64_t N, + int64_t K, + int64_t E, + int64_t topk, + int64_t num_tokens_post_pad); + +// shared expert implememntation for int8 w8a8 +template +void shared_expert_int8_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic1, + float* __restrict__ C_tmp, + uint8_t* __restrict__ Aq_tmp, + float* __restrict__ As_tmp, + const scalar_t* __restrict__ input, + const int8_t* __restrict__ packed_w1, + const int8_t* __restrict__ packed_w2, + const float* __restrict__ w1s, + const float* __restrict__ w2s, + const scalar_t* __restrict__ fused_experts_out, + float routed_scaling_factor, + int64_t M, + int64_t N, + int64_t K); + +template +void shared_expert_fp8_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic0, + scalar_t* __restrict__ ic1, + scalar_t* __restrict__ B_tmp, + float* __restrict__ C_tmp, + const scalar_t* __restrict__ input, + const at::Float8_e4m3fn* __restrict__ packed_w1, + const at::Float8_e4m3fn* __restrict__ packed_w2, + const float* __restrict__ w1s, + const float* __restrict__ w2s, + int64_t block_size_N, + 
int64_t block_size_K, + const scalar_t* __restrict__ fused_experts_out, + float routed_scaling_factor, + int64_t M, + int64_t N, + int64_t K); + +// tinygemm interface +template +void tinygemm_kernel( + const scalar_t* __restrict__ A, + const scalar_t* __restrict__ B, + scalar_t* __restrict__ C, + float* __restrict__ Ctmp, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc, + bool brg); + +template +void tinygemm_kernel( + const uint8_t* __restrict__ A, + const int8_t* __restrict__ B, + scalar_t* __restrict__ C, + int32_t* __restrict__ Ctmp, + const float* __restrict__ As, + const float* __restrict__ Bs, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc, + bool brg); + +template +void tinygemm_kernel( + const scalar_t* __restrict__ A, + const at::Float8_e4m3fn* __restrict__ B, + scalar_t* __restrict__ C, + scalar_t* __restrict__ Btmp, + float* __restrict__ Ctmp, + const float* __restrict__ scale, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc, + bool brg, + int64_t block_size_K); + +template +void tinygemm_kernel( + const scalar_t* __restrict__ A, + const at::quint4x2* __restrict__ B, + scalar_t* __restrict__ C, + const uint8_t* __restrict__ Bz, + const scalar_t* __restrict__ Bs, + scalar_t* __restrict__ Btmp, + float* __restrict__ Ctmp, + int64_t M, + int64_t N, + int64_t K, + int group_size, + int64_t lda, + int64_t ldb, + int64_t ldc, + int64_t strideBz, + int64_t strideBs, + bool brg); + +// TODO: debug print, remove me later +inline void print_16x32i(const __m512i x) { + int32_t a[16]; + _mm512_storeu_si512((__m512i *)a, x); + + for (int i = 0; i < 16; i++){ + std::cout << a[i] << " "; + } + std::cout << std::endl; +} + +inline void print_16x32(const __m512 x) { + float a[16]; + _mm512_storeu_ps((__m512 *)a, x); + + for (int i = 0; i < 16; i++){ + std::cout << a[i] << " "; + } + std::cout << std::endl; +} + + +inline void print_32x8u(const __m256i x) { + uint8_t a[32]; + _mm256_storeu_si256((__m256i *)a, x); + + for (int i = 0; i < 32; ++i) { + std::cout << int32_t(a[i]) << " "; + } + std::cout << std::endl; +} diff --git a/csrc/cpu/sgl-kernels/gemm_fp8.cpp b/csrc/cpu/sgl-kernels/gemm_fp8.cpp new file mode 100644 index 0000000000000..b5f2f07bad623 --- /dev/null +++ b/csrc/cpu/sgl-kernels/gemm_fp8.cpp @@ -0,0 +1,530 @@ +// Adapted from +// https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc/cpu + +#include "common.h" +#include "vec.h" +#include "gemm.h" + +// clang-format off + +// we use 4x32 for BLOCK_M +#define BLOCK_SIZE_M_SCALE 4 + +namespace { + +template +inline void copy_stub(scalar_t* __restrict__ out, const float* __restrict__ input, int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + fVec data0 = fVec::loadu(input + d); + fVec data1 = fVec::loadu(input + d + fVec::size()); + bVec out_vec = convert_from_float_ext(data0, data1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d]); + } +} + +template +inline void copy_add_stub(scalar_t* __restrict__ out, const float* __restrict__ input, const float* __restrict__ bias, int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + fVec data0 = 
fVec::loadu(input + d) + fVec::loadu(bias + d); + fVec data1 = fVec::loadu(input + d + fVec::size()) + fVec::loadu(bias + d + fVec::size()); + bVec out_vec = convert_from_float_ext(data0, data1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d] + bias[d]); + } +} + +inline void unpack_B( + at::BFloat16* __restrict__ Btmp, + const at::Float8_e4m3fn* __restrict__ packed_B, + int N, + int K, + int ldb, + int ldb_tmp, + float scale) { +#if defined(CPU_CAPABILITY_AVX512) + // [K/2, N, 2] + const int K2 = K >> 1; + const int ldb2 = ldb; // ldb * 2 >> 1; + const uint16_t* b_ptr = reinterpret_cast(packed_B); + const __m512 vd = _mm512_set1_ps(scale); + + constexpr int BLOCK_N = block_size_n(); + static_assert(BLOCK_N == 32); + + // prefetch distance + constexpr int PREFETCH_SIZE_K = 64; + +#pragma GCC unroll 4 + for (int k = 0; k < K2; ++k) { + __m512i b8 = _mm512_loadu_si512(b_ptr + k * ldb2); + if constexpr (PREFETCH_SIZE_K > 0) { + _mm_prefetch(b_ptr + (k + PREFETCH_SIZE_K) * ldb2, _MM_HINT_T0); + } + + __m256i b8_0 = _mm512_extracti32x8_epi32(b8, 0); + __m256i b8_1 = _mm512_extracti32x8_epi32(b8, 1); + + __m512bh bf16_0 = CVT_FP8_TO_BF16(b8_0); + __m512bh bf16_1 = CVT_FP8_TO_BF16(b8_1); + + // Apply scale + __m512 f0_lo = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32((__m512i)bf16_0, 0)); + __m512 f0_hi = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32((__m512i)bf16_0, 1)); + __m512 f1_lo = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32((__m512i)bf16_1, 0)); + __m512 f1_hi = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32((__m512i)bf16_1, 1)); + + f0_lo = _mm512_mul_ps(f0_lo, vd); + f0_hi = _mm512_mul_ps(f0_hi, vd); + f1_lo = _mm512_mul_ps(f1_lo, vd); + f1_hi = _mm512_mul_ps(f1_hi, vd); + + bf16_0 = _mm512_cvtne2ps_pbh(f0_hi, f0_lo); + bf16_1 = _mm512_cvtne2ps_pbh(f1_hi, f1_lo); + + _mm512_storeu_si512(Btmp + k * ldb_tmp * 2 + 0, (__m512i)bf16_0); + _mm512_storeu_si512(Btmp + k * ldb_tmp * 2 + 32, (__m512i)bf16_1); + } +#else + TORCH_CHECK(false, "unpack_B: scalar path not implemented!"); +#endif +} + +template +struct tinygemm_kernel_nn { + static inline void apply( + const scalar_t* __restrict__ A, const packed_t* __restrict__ B, scalar_t* __restrict__ C, + const float* __restrict__ bias, const float* __restrict__ scale, int K, int lda, int ldb, int ldc, int64_t block_size_K) { + TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!"); + } +}; + +#if defined(CPU_CAPABILITY_AVX512) +template +struct tinygemm_kernel_nn { + static inline void apply( + const at::BFloat16* __restrict__ A, const at::Float8_e4m3fn* __restrict__ B, at::BFloat16* __restrict__ C, + const float* __restrict__ bias, const float* __restrict__ scale, int K, int lda, int ldb, int ldc, int64_t block_size_K) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 16; + + const int KB = div_up(K, BLOCK_K); + + // prefetch distance + constexpr int PREFETCH_SIZE_K = 64; + constexpr int PREFETCH_SIZE_KB = 1; + + __m512bh va; + __m512bh vb[COLS]; + __m512 vc[ROWS * COLS]; + __m512 vsum[ROWS * COLS]; + + // block quant scale + __m512 vscale; + + auto loadc = [&](auto i) { + constexpr int col = i % COLS; + if constexpr (has_bias) { + vc[i] = _mm512_loadu_ps(bias + col * 16); + } else { + vc[i] = _mm512_setzero_ps(); + } + }; + Unroll{}(loadc); + + const int lda2 = lda >> 1; + const int ldb2 = ldb; // ldb * 2 >> 1; + const float* a_ptr = reinterpret_cast(A); + const uint16_t* b_ptr = reinterpret_cast(B); + + auto compute = [&](auto i, int k) { + constexpr int row = i / COLS; + 
constexpr int col = i % COLS; + + if constexpr (col == 0) { + va = (__m512bh)(_mm512_set1_ps(a_ptr[row * lda2 + k])); + if constexpr (PREFETCH_SIZE_K > 0) { + _mm_prefetch(a_ptr + row * lda2 + k + PREFETCH_SIZE_K, _MM_HINT_T0); + } + } + if constexpr (row == 0) { + if constexpr (col % 2 == 0) { + __m512i b8 = _mm512_loadu_si512(b_ptr + k * ldb2 + col * 16); + if constexpr (PREFETCH_SIZE_K > 0) { + _mm_prefetch(b_ptr + (k + PREFETCH_SIZE_K) * ldb2 + col * 16, _MM_HINT_T0); + } + vb[col + 0] = CVT_FP8_TO_BF16(_mm512_extracti32x8_epi32(b8, 0)); + vb[col + 1] = CVT_FP8_TO_BF16(_mm512_extracti32x8_epi32(b8, 1)); + } + } + vsum[i] = _mm512_dpbf16_ps(vsum[i], va, vb[col]); + }; + + constexpr int BLOCK_K2 = BLOCK_K >> 1; + for (int kb = 0; kb < KB; ++kb) { + int kb_start = kb * BLOCK_K2; + int kb_end = std::min(K, kb_start + BLOCK_K2); + // 1. load scale vector + vscale = _mm512_set1_ps(scale[kb]); + if constexpr (PREFETCH_SIZE_KB > 0) { + _mm_prefetch(scale + kb + PREFETCH_SIZE_KB, _MM_HINT_T0); + } + // 2. zero vsum for each block + Unroll{}([&](auto i) { + vsum[i] = _mm512_setzero_ps(); + }); + // 3. accumulate across each block + for (int k = kb_start; k < kb_end; ++k) { + Unroll{}(compute, k); + } + // 4. apply scale + Unroll{}([&](auto i) { + vc[i] = _mm512_fmadd_ps(vsum[i], vscale, vc[i]); + }); + } + + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + // for COLS = 2,4 use 512bit store + if constexpr (col % 2 == 0) { + _mm512_storeu_si512( + reinterpret_cast<__m512i*>((C + row * ldc + col * 16)), + (__m512i)(_mm512_cvtne2ps_pbh(vc[row * COLS + col + 1], vc[row * COLS + col]))); + } + }; + Unroll{}(storec); + } +}; +#endif + +#define LAUNCH_TINYGEMM_KERNEL_NN(MB_SIZE, NB_SIZE) \ + tinygemm_kernel_nn::apply( \ + A + mb_start * lda, B + nb_start * 2, C + mb_start * ldc + nb_start, \ + has_bias ? 
bias + nb_start : nullptr, scale, K, lda, ldb, ldc, block_size_K); + +template +struct brgemm { + static inline void apply( + const scalar_t* __restrict__ A, + const packed_t* __restrict__ B, + scalar_t* __restrict__ C, + scalar_t* __restrict__ Btmp, + float* __restrict__ Ctmp, + const float* __restrict__ bias, + const float* __restrict__ scale, + int M, + int N, + int K, + int lda, + int ldb, + int ldc) { + TORCH_CHECK(false, "struct brgemm: primary template not implemented!"); + } +}; + +template +struct brgemm { + static inline void apply( + const at::BFloat16* __restrict__ A, + const at::Float8_e4m3fn* __restrict__ B, + at::BFloat16* __restrict__ C, + at::BFloat16* __restrict__ Btmp, + float* __restrict__ Ctmp, + const float* __restrict__ bias, + const float* __restrict__ scale, + int M, + int N, + int K, + int lda, + int ldb, + int ldc) { + + constexpr int BLOCK_N = block_size_n(); + + // [K, BLOCK_N] -> [K / 2, BLOCK_N * 2] + const int ldb_tmp = BLOCK_N; + + for (int k = 0; k < K; k += BLOCK_K) { + int kb_size = std::min(BLOCK_K, K - k); + + int idx = k >> 7; // k / BLOCK_K where BLOCK_K = 128 + unpack_B(Btmp + k * ldb_tmp, B + k * ldb, N, kb_size, ldb, ldb_tmp, scale[idx]); + } + + at::native::cpublas::brgemm( + M, N, K, lda, ldb_tmp, BLOCK_N, /* add_C */ false, A, Btmp, Ctmp); + + // copy from Ctmp to C + for (int m = 0; m < M; ++m) { + if constexpr (has_bias) { + copy_add_stub(C + m * ldc, Ctmp + m * BLOCK_N, bias, N); + } else { + copy_stub(C + m * ldc, Ctmp + m * BLOCK_N, N); + } + } + } +}; + +template +void tinygemm_kernel( + const scalar_t* __restrict__ A, + const at::Float8_e4m3fn* __restrict__ B, + scalar_t* __restrict__ C, + scalar_t* __restrict__ Btmp, + float* __restrict__ Ctmp, + const float* __restrict__ scale, + const float* __restrict__ bias, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc, + bool brg, + int64_t block_size_K) { + + if (brg) { + brgemm::apply( + A, B, C, Btmp, Ctmp, bias, scale, M, N, K, lda, ldb, ldc); + return; + } + + // pattern: 1-4-16 + constexpr int64_t BLOCK_M = 4; + constexpr int64_t BLOCK_N = 64; + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + for (int mb = 0; mb < MB; ++mb) { + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(BLOCK_M, M - mb_start); + for (int64_t nb = 0; nb < NB; ++nb) { + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(BLOCK_N, N - nb_start); + + switch(mb_size << 4 | nb_size >> 4) { + case 0x12: LAUNCH_TINYGEMM_KERNEL_NN(1, 32); break; + case 0x22: LAUNCH_TINYGEMM_KERNEL_NN(2, 32); break; + case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break; + case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break; + default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size"); + } + } + } +} + +template +void fp8_scaled_mm_kernel_impl( + scalar_t* __restrict__ out, + const scalar_t* __restrict__ mat1, + const at::Float8_e4m3fn* __restrict__ mat2, + const float* __restrict__ scales2, + const float* __restrict__ bias, + scalar_t* __restrict__ buffer, + int64_t M, + int64_t N, + int64_t K, + int64_t mat1_strideM, + int64_t out_strideM, + int64_t block_size_N, + int64_t block_size_K, + int64_t buffer_size_per_thread) { + + constexpr int64_t BLOCK_M = block_size_m() * BLOCK_SIZE_M_SCALE; + constexpr int64_t BLOCK_N = block_size_n(); + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + + const int64_t scale_size_K = div_up(K, block_size_K); + const int64_t blocks_n_per_group = block_size_N / BLOCK_N; + 
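+  // Per-thread scratch layout (carved out of `buffer` below, illustrative):
+  // the first BLOCK_N * K scalar_t elements hold the dequantized B tile
+  // (Btmp), followed by BLOCK_M * BLOCK_N float accumulators (Ctmp).
+  // Scale grouping example: with 128x128 quant blocks and BLOCK_N = 32,
+  // blocks_n_per_group = 4, so column tiles nb = 0..3 all read row 0 of
+  // scales2, nb = 4..7 read row 1, and so on.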
+ const bool use_brgemm = can_use_brgemm(M); + + // parallel on [MB, NB] + AT_DISPATCH_BOOL(bias != nullptr, has_bias, [&] { + at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { + int64_t mb{0}, nb{0}; + data_index_init(begin, mb, MB, nb, NB); + + int tid = at::get_thread_num(); + scalar_t* __restrict__ Btmp = buffer + tid * buffer_size_per_thread; + float* __restrict__ Ctmp = (float*)((void*)(Btmp + BLOCK_N * K)); + + for (int64_t i = begin; i < end; ++i) { + UNUSED(i); + const float* scale_ptr = scales2 + (nb / blocks_n_per_group) * scale_size_K; + + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(M - mb_start, BLOCK_M); + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(N - nb_start, BLOCK_N); + + tinygemm_kernel( + /* A */ mat1 + mb_start * mat1_strideM, + /* B */ mat2 + nb_start * K, // nb * BLOCK_N * K + /* C */ out + mb_start * out_strideM + nb_start, + /* Btmp */ Btmp, + /* Ctmp */ Ctmp, + /* scale */ scale_ptr, + /* bias */ bias + nb_start, + /* M */ mb_size, + /* N */ nb_size, + /* K */ K, + /* lda */ mat1_strideM, + /* ldb */ nb_size, + /* ldc */ out_strideM, + /* brg */ use_brgemm, + /* block_size_K */ block_size_K); + + // move to the next index + data_index_step(mb, MB, nb, NB); + } + + if (use_brgemm) { + at::native::cpublas::brgemm_release(); + } + }); + }); +} + +} // anonymous namespace + +// tinygemm interface +template +void tinygemm_kernel( + const scalar_t* __restrict__ A, + const at::Float8_e4m3fn* __restrict__ B, + scalar_t* __restrict__ C, + scalar_t* __restrict__ Btmp, + float* __restrict__ Ctmp, + const float* __restrict__ scale, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc, + bool brg, + int64_t block_size_K) { + tinygemm_kernel(A, B, C, Btmp, Ctmp, scale, nullptr, M, N, K, lda, ldb, ldc, brg, block_size_K); +} + +#define INSTANTIATE_TINYGEMM_TEMPLATE(TYPE) \ + template void tinygemm_kernel( \ + const TYPE* __restrict__ A, \ + const at::Float8_e4m3fn* __restrict__ B, \ + TYPE* __restrict__ C, \ + TYPE* __restrict__ Btmp, \ + float* __restrict__ Ctmp, \ + const float* __restrict__ scale, \ + int64_t M, \ + int64_t N, \ + int64_t K, \ + int64_t lda, \ + int64_t ldb, \ + int64_t ldc, \ + bool brg, \ + int64_t block_size_K) + +INSTANTIATE_TINYGEMM_TEMPLATE(at::BFloat16); +INSTANTIATE_TINYGEMM_TEMPLATE(at::Half); + +at::Tensor fp8_scaled_mm_cpu(at::Tensor& mat1, at::Tensor& mat2, at::Tensor& scales2, + std::vector block_size, std::optional& bias, + at::ScalarType out_dtype, bool is_vnni) { + RECORD_FUNCTION("sgl-kernel::fp8_scaled_mm_cpu", std::vector({mat1, mat2, scales2, block_size, bias})); + + auto packed_w = is_vnni ? 
mat2 : convert_weight_packed(mat2); + + CHECK_LAST_DIM_CONTIGUOUS_INPUT(mat1); + CHECK_INPUT(mat2); + CHECK_INPUT(scales2); + TORCH_CHECK(scales2.scalar_type() == at::kFloat, + "fp8_scaled_mm_cpu: expect scales2 to be float32."); + + int64_t M = mat1.size(0); + int64_t N = mat2.size(0); + int64_t K = mat2.size(1); + + CHECK_EQ(mat1.size(1), K); + CHECK_DIM(2, mat1); + CHECK_DIM(2, mat2); + + TORCH_CHECK(block_size.size() == 2, + "fp8_scaled_mm_cpu: expect block_size.size() to be 2."); + + int64_t block_size_N = block_size[0]; + int64_t block_size_K = block_size[1]; + + constexpr int64_t BLOCK_M = block_size_m() * BLOCK_SIZE_M_SCALE; + constexpr int64_t BLOCK_N = block_size_n(); + TORCH_CHECK(block_size_N % BLOCK_N == 0, "fp8_scaled_mm_cpu: expect block_size_N to be multiples of BLOCK_N"); + TORCH_CHECK(block_size_K == BLOCK_K, "fp8_scaled_mm_cpu: expect block_size_K equals to BLOCK_K"); + CHECK_EQ(scales2.size(0), div_up(N, block_size_N)); + CHECK_EQ(scales2.size(1), div_up(K, block_size_K)); + + const auto st = mat1.scalar_type(); + TORCH_CHECK(st == at::kBFloat16 || st == at::kHalf, + "fp8_scaled_mm_cpu: expect A to be bfloat16 or half."); + TORCH_CHECK(st == out_dtype, + "fp8_scaled_mm_cpu: expect A has same dtype with out_dtype."); + TORCH_CHECK(mat2.scalar_type() == at::kFloat8_e4m3fn, + "fp8_scaled_mm_cpu: expect mat2 to be fp8_e4m3."); + TORCH_CHECK(scales2.scalar_type() == at::kFloat, + "fp8_scaled_mm_cpu: expect scales to be float32."); + auto out = at::empty({M, N}, mat1.options().dtype(out_dtype)); + + // strides + int64_t mat1_strideM = mat1.stride(0); + int64_t out_strideM = out.stride(0); + + const bool has_bias = bias.has_value(); + const float* bias_data = nullptr; + if (has_bias) { + CHECK_EQ(bias.value().size(0), N); + bias_data = bias.value().data_ptr(); + } + + // Btmp : [T, BLOCK_N * K] + // Ctmp : [T, BLOCK_M * BLOCK_N] + int num_threads = at::get_num_threads(); + int64_t size_per_thread = BLOCK_N * K + BLOCK_M * BLOCK_N * 2; + auto buffer = at::empty({num_threads, size_per_thread}, mat1.options()); + + AT_DISPATCH_REDUCED_FLOATING_TYPES(out_dtype, "fp8_scaled_mm_kernel_impl", [&] { + fp8_scaled_mm_kernel_impl( + out.data_ptr(), + mat1.data_ptr(), + packed_w.data_ptr(), + scales2.data_ptr(), + bias_data, + buffer.data_ptr(), + M, + N, + K, + mat1_strideM, + out_strideM, + block_size_N, + block_size_K, + size_per_thread); + }); + + return out; +} diff --git a/csrc/cpu/sgl-kernels/gemm_int8.cpp b/csrc/cpu/sgl-kernels/gemm_int8.cpp new file mode 100644 index 0000000000000..5a0f65a9200d4 --- /dev/null +++ b/csrc/cpu/sgl-kernels/gemm_int8.cpp @@ -0,0 +1,440 @@ +// Adapted from +// https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc/cpu + +#include "common.h" +#include "vec.h" +#include "gemm.h" + +// clang-format off + +namespace { + +template +struct tinygemm_kernel_nn { + static inline void apply( + const uint8_t* __restrict__ A, const int8_t* __restrict__ B, scalar_t* __restrict__ C, + const float* __restrict__ As, const float* __restrict__ Bs, const int32_t* __restrict__ Bcomp, + const float* __restrict__ bias, int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!"); + } +}; + +#if defined(CPU_CAPABILITY_AVX512) +template +struct tinygemm_kernel_nn { + static inline void apply( + const uint8_t* __restrict__ A, const int8_t* __restrict__ B, at::BFloat16* __restrict__ C, + const float* __restrict__ As, const float* __restrict__ Bs, const int32_t* __restrict__ Bcomp, + const float* 
__restrict__ bias, int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 16; + static_assert(COLS % 2 == 0); + + // prefetch distance + constexpr int PREFETCH_SIZE_K = 0; + + __m512i va; + __m512i vb[COLS]; + __m512i vc[ROWS * COLS]; + __m512i vcomp[COLS]; + __m512 vd0; + __m512 vd1[COLS]; + + // oops! 4x4 spills but luckly we use 4x2 + __m512 vbias[COLS]; + + // [NOTE]: s8s8 igemm compensation in avx512-vnni + // + // avx512-vnni has no s8s8, so we need to change s8s8 to u8s8 with compensate: + // + // a * b = (a + 128) * b - 128 * b + // s s u s u s + // + // 1) 128 * b is pre-computed when packing B to vnni formats + // 2) a + 128 is fused when dynamically quantize A + // + auto loadc = [&](auto i) { + vc[i] = _mm512_set1_epi32(0); + }; + Unroll{}(loadc); + + const int64_t K4 = K >> 2; + const int64_t lda4 = lda >> 2; + const int64_t ldb4 = ldb; // ldb * 4 >> 2; + const int32_t* a_ptr = reinterpret_cast(A); + const int32_t* b_ptr = reinterpret_cast(B); + + auto compute = [&](auto i, int64_t k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + va = _mm512_set1_epi32(a_ptr[row * lda4 + k]); + } + if constexpr (row == 0) { + vb[col] = _mm512_loadu_si512(b_ptr + k * ldb4 + col * 16); + if constexpr (PREFETCH_SIZE_K > 0) { + _mm_prefetch(b_ptr + (k + PREFETCH_SIZE_K) * ldb4 + col * 16, _MM_HINT_T0); + } + } + vc[i] = _mm512_dpbusd_epi32(vc[i], va, vb[col]); + }; + for (int64_t k = 0; k < K4; ++k) { + Unroll{}(compute, k); + } + + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + // load a scale + if constexpr(col == 0) { + vd0 = _mm512_set1_ps(As[row]); + } + // load b scale and vcomp per 2 vectors + // also load bias if any + if constexpr (row == 0) { + if constexpr (col % 2 == 0) { + vd1[col + 0] = _mm512_loadu_ps(Bs + col * 16); + vd1[col + 1] = _mm512_loadu_ps(Bs + col * 16 + 16); + vcomp[col + 0] = _mm512_loadu_si512(Bcomp + col * 16); + vcomp[col + 1] = _mm512_loadu_si512(Bcomp + col * 16 + 16); + if constexpr (has_bias) { + vbias[col + 0] = _mm512_loadu_ps(bias + col * 16); + vbias[col + 1] = _mm512_loadu_ps(bias + col * 16 + 16); + } + } + } + + // for COLS = 2, 4 use 512bit store + if constexpr (col % 2 == 0) { + __m512 vc0 = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc[row * COLS + col + 0], vcomp[col + 0])); + __m512 vc1 = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc[row * COLS + col + 1], vcomp[col + 1])); + if constexpr (has_bias) { + vc0 = _mm512_fmadd_ps(_mm512_mul_ps(vc0, vd0), vd1[col + 0], vbias[col + 0]); + vc1 = _mm512_fmadd_ps(_mm512_mul_ps(vc1, vd0), vd1[col + 1], vbias[col + 1]); + } else { + vc0 = _mm512_mul_ps(_mm512_mul_ps(vc0, vd0), vd1[col + 0]); + vc1 = _mm512_mul_ps(_mm512_mul_ps(vc1, vd0), vd1[col + 1]); + } + + _mm512_storeu_si512( + reinterpret_cast<__m512i*>((C + row * ldc + col * 16)), + (__m512i)(_mm512_cvtne2ps_pbh(vc1, vc0))); + } + }; + Unroll{}(storec); + } +}; +#endif + +#define LAUNCH_TINYGEMM_KERNEL_NN(MB_SIZE, NB_SIZE) \ + tinygemm_kernel_nn::apply( \ + A + mb_start * lda, B + nb_start * 4, C + mb_start * ldc + nb_start, \ + As + mb_start, Bs + nb_start, Bcomp + nb_start, \ + has_bias ? 
bias + nb_start : nullptr, K, lda, ldb, ldc); + +template +void tinygemm_kernel( + const uint8_t* __restrict__ A, + const int8_t* __restrict__ B, + scalar_t* __restrict__ C, + int32_t* __restrict__ Ctmp, + const float* __restrict__ As, + const float* __restrict__ Bs, + const float* __restrict__ bias, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc, + bool brg) { + + // B compensation + const int32_t* Bcomp = reinterpret_cast(B + block_size_n() * K); + + // pattern: 1-4-16 + constexpr int64_t BLOCK_M = 4; + constexpr int64_t BLOCK_N = 64; + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + for (int64_t mb = 0; mb < MB; ++mb) { + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(BLOCK_M, M - mb_start); + for (int64_t nb = 0; nb < NB; ++nb) { + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(BLOCK_N, N - nb_start); + + switch(mb_size << 4 | nb_size >> 4) { + // mb_size = 1 + case 0x12: LAUNCH_TINYGEMM_KERNEL_NN(1, 32); break; + case 0x14: LAUNCH_TINYGEMM_KERNEL_NN(1, 64); break; + // mb_size = 2 + case 0x22: LAUNCH_TINYGEMM_KERNEL_NN(2, 32); break; + case 0x24: LAUNCH_TINYGEMM_KERNEL_NN(2, 64); break; + // mb_size = 3 + case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break; + case 0x34: LAUNCH_TINYGEMM_KERNEL_NN(3, 64); break; + // mb_size = 4 + case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break; + case 0x44: LAUNCH_TINYGEMM_KERNEL_NN(4, 64); break; + default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size"); + } + } + } +} + +template +void int8_scaled_mm_kernel_impl( + scalar_t* __restrict__ out, + const uint8_t* __restrict__ mat1, + const int8_t* __restrict__ mat2, + const float* __restrict__ scales1, + const float* __restrict__ scales2, + const float* __restrict__ bias, + int64_t M, + int64_t N, + int64_t K) { + + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + + // TODO: brgemm u8s8 depends on PyTorch 2.7 release. 
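+  // Until then the AVX512-VNNI tinygemm path above is always used. Worked
+  // example of its s8s8 -> u8s8 compensation (illustrative, K = 1):
+  // a = -3 (s8), b = 5 (s8): A is quantized to a + 128 = 125 (u8), the u8s8
+  // dot product gives 125 * 5 = 625, packing precomputed 128 * 5 = 640, and
+  // 625 - 640 = -15 = a * b. This is also why each packed row is
+  // K + sizeof(int32_t) bytes: the trailing int32 per output channel stores
+  // 128 * sum_k B[k].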
+ const bool use_brgemm = false; + + // K + 4 after compensation + const int64_t packed_row_size = get_row_size(K); + + AT_DISPATCH_BOOL(bias != nullptr, has_bias, [&] { + at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { + int64_t mb{0}, nb{0}; + data_index_init(begin, mb, MB, nb, NB); + + // for brgemm, use int32_t for accumulate + alignas(64) int32_t Ctmp[BLOCK_M * BLOCK_N]; + + for (int i = begin; i < end; ++i) { + UNUSED(i); + int mb_start = mb * BLOCK_M; + int mb_size = std::min(M - mb_start, BLOCK_M); + int nb_start = nb * BLOCK_N; + int nb_size = std::min(N - nb_start, BLOCK_N); + + tinygemm_kernel( + /* A */ mat1 + mb_start * K, + /* B */ mat2 + nb_start * packed_row_size /* nb * BLOCK_N * (K + 4) */, + /* C */ out + mb_start * N + nb_start, + /* Ctmp*/ Ctmp, + /* As */ scales1 + mb_start, + /* Bs */ scales2 + nb_start, + /* bias*/ bias + nb_start, + /* M */ mb_size, + /* N */ nb_size, + /* K */ K, + /* lda */ K, + /* ldb */ nb_size, + /* ldc */ N, + /* brg */ use_brgemm); + + // move to the next index + data_index_step(mb, MB, nb, NB); + } + + if (use_brgemm) { + at::native::cpublas::brgemm_release(); + } + }); + }); +} + +} // anonymous namespace + +// tinygemm interface +template +void tinygemm_kernel(const uint8_t* __restrict__ A, const int8_t* __restrict__ B, scalar_t* __restrict__ C, + int32_t* __restrict__ Ctmp, const float* __restrict__ As, const float* __restrict__ Bs, + int64_t M, int64_t N, int64_t K, int64_t lda, int64_t ldb, int64_t ldc, bool brg) { + tinygemm_kernel(A, B, C, Ctmp, As, Bs, nullptr, M, N, K, lda, ldb, ldc, brg); +} + +#define INSTANTIATE_TINYGEMM_TEMPLATE(TYPE) \ + template void tinygemm_kernel( \ + const uint8_t* __restrict__ A, const int8_t* __restrict__ B, TYPE* __restrict__ C, \ + int32_t* __restrict__ Ctmp, const float* __restrict__ As, const float* __restrict__ Bs, \ + int64_t M, int64_t N, int64_t K, int64_t lda, int64_t ldb, int64_t ldc, bool brg) + +INSTANTIATE_TINYGEMM_TEMPLATE(at::BFloat16); +INSTANTIATE_TINYGEMM_TEMPLATE(at::Half); + +std::tuple per_token_quant_int8_cpu(at::Tensor& A) { + RECORD_FUNCTION("sgl-kernel::per_token_quant_int8_cpu", std::vector({A})); + + CHECK_LAST_DIM_CONTIGUOUS_INPUT(A); + CHECK_DIM(2, A); + + int64_t M = A.size(0); + int64_t K = A.size(1); + int64_t lda = A.stride(0); + + const auto st = A.scalar_type(); + TORCH_CHECK(st == at::kBFloat16 || st == at::kHalf, + "per_token_quant_int8: expect A to be bfloat16 or half."); + + auto Aq = at::empty({M, K}, A.options().dtype(at::kByte)); + auto As = at::empty({M}, A.options().dtype(at::kFloat)); + + AT_DISPATCH_REDUCED_FLOATING_TYPES(st, "per_token_quant_int8", [&] { + uint8_t* __restrict__ Aq_data = Aq.data_ptr(); + float* __restrict__ As_data = As.data_ptr(); + const scalar_t* __restrict__ A_data = A.data_ptr(); + + at::parallel_for(0, M, 0, [&] (int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + quantize_row_int8( + Aq_data + m * K, + As_data[m], + A_data + m * lda, + K); + } + }); + }); + return std::make_tuple(Aq, As); +} + +// weight : static, per-channel, symmetric +// activation : dynamic, per-token, symmetric +// +// mat1 : [M, K] +// mat2 : [N, K] +// scales1 : [M] +// scales2 : [N] +// bias : [N] +// out : [M, N] +// +at::Tensor int8_scaled_mm_cpu(at::Tensor& mat1, at::Tensor& mat2, + at::Tensor& scales1, at::Tensor& scales2, + std::optional& bias, at::ScalarType out_dtype, bool is_vnni) { + RECORD_FUNCTION("sgl-kernel::int8_scaled_mm_cpu", std::vector({mat1, mat2, scales1, scales2, bias})); + + auto packed_w = is_vnni 
? mat2 : convert_weight_packed(mat2); + + CHECK_INPUT(mat1); + CHECK_INPUT(mat2); + CHECK_INPUT(scales1); + CHECK_INPUT(scales2); + CHECK_DIM(2, mat1); + CHECK_DIM(2, mat2); + + int64_t M = mat1.size(0); + int64_t N = mat2.size(0); + int64_t K = mat1.size(1); + + // see [NOTE]: s8s8 igemm compensation in avx512-vnni + CHECK_EQ(mat2.size(1), (int64_t)(is_vnni ? K + sizeof(int32_t) : K)); + CHECK_EQ(scales1.numel(), M); + CHECK_EQ(scales2.numel(), N); + + TORCH_CHECK(mat1.scalar_type() == at::kByte, "int8_scaled_mm: expect mat1 to be uint8."); + TORCH_CHECK(mat2.scalar_type() == at::kChar, "int8_scaled_mm: expect mat2 to be int8."); + TORCH_CHECK(scales1.scalar_type() == at::kFloat && scales2.scalar_type() == at::kFloat, + "int8_scaled_mm: expect scales to be float32."); + + auto out = at::empty({M, N}, mat1.options().dtype(out_dtype)); + + const bool has_bias = bias.has_value(); + const float* bias_data = nullptr; + if (has_bias) { + CHECK_EQ(bias.value().size(0), N); + bias_data = bias.value().data_ptr(); + } + + AT_DISPATCH_REDUCED_FLOATING_TYPES(out_dtype, "int8_scaled_mm_kernel_impl", [&] { + int8_scaled_mm_kernel_impl( + out.data_ptr(), + mat1.data_ptr(), + packed_w.data_ptr(), + scales1.data_ptr(), + scales2.data_ptr(), + bias_data, + M, + N, + K); + }); + return out; +} + +// fused `per_token_quant_int8_cpu` and `int8_scaled_mm_cpu` +at::Tensor int8_scaled_mm_with_quant(at::Tensor& mat1, at::Tensor& mat2, at::Tensor& scales2, + const std::optional& bias, at::ScalarType out_dtype, bool is_vnni) { + RECORD_FUNCTION("sgl-kernel::int8_scaled_mm_cpu", std::vector({mat1, mat2, scales2, bias})); + + auto packed_w = is_vnni ? mat2 : convert_weight_packed(mat2); + + CHECK_LAST_DIM_CONTIGUOUS_INPUT(mat1); + CHECK_INPUT(mat2); + CHECK_INPUT(scales2); + CHECK_DIM(2, mat1); + CHECK_DIM(2, mat2); + + int64_t M = mat1.size(0); + int64_t N = mat2.size(0); + int64_t K = mat1.size(1); + int64_t lda = mat1.stride(0); + + // see [NOTE]: s8s8 igemm compensation in avx512-vnni + CHECK_EQ(mat2.size(1), (int64_t)(is_vnni ? 
K + sizeof(int32_t) : K)); + CHECK_EQ(scales2.numel(), N); + + const auto st = mat1.scalar_type(); + TORCH_CHECK(st == at::kBFloat16 || st == at::kHalf, + "int8_scaled_mm_with_quant: expect A to be bfloat16 or half."); + TORCH_CHECK(st == out_dtype, + "int8_scaled_mm_with_quant: expect A has same dtype with out_dtype."); + TORCH_CHECK(mat2.scalar_type() == at::kChar, + "int8_scaled_mm_with_quant: expect mat2 to be int8."); + TORCH_CHECK(scales2.scalar_type() == at::kFloat, + "int8_scaled_mm_with_quant: expect scales to be float32."); + + const int64_t buffer_size = M * K + M * sizeof(float); + auto buffer = at::empty({buffer_size}, mat1.options().dtype(at::kByte)); + auto out = at::empty({M, N}, mat1.options().dtype(out_dtype)); + + const bool has_bias = bias.has_value(); + const float* bias_data = nullptr; + if (has_bias) { + CHECK_EQ(bias.value().size(0), N); + bias_data = bias.value().data_ptr(); + } + + AT_DISPATCH_REDUCED_FLOATING_TYPES(out_dtype, "int8_scaled_mm_with_quant_kernel_impl", [&] { + uint8_t* __restrict__ Aq_data = buffer.data_ptr(); + float* __restrict__ As_data = (float*)((void*)(Aq_data + M * K)); + const scalar_t* __restrict__ A_data = mat1.data_ptr(); + + at::parallel_for(0, M, 0, [&] (int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + quantize_row_int8( + Aq_data + m * K, + As_data[m], + A_data + m * lda, + K); + } + }); + + int8_scaled_mm_kernel_impl( + out.data_ptr(), + Aq_data, + packed_w.data_ptr(), + As_data, + scales2.data_ptr(), + bias_data, + M, + N, + K); + }); + return out; +} diff --git a/csrc/cpu/sgl-kernels/moe.cpp b/csrc/cpu/sgl-kernels/moe.cpp new file mode 100644 index 0000000000000..beeccff783ea0 --- /dev/null +++ b/csrc/cpu/sgl-kernels/moe.cpp @@ -0,0 +1,1330 @@ +// Adapted from +// https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc/cpu + +#include "common.h" +#include "vec.h" +#include "gemm.h" + +// clang-format off + +namespace { + +// [NOTE]: Fused MoE kernel with AMX +// +// This file contains implementations for +// * `moe_align_block_size` +// * `fused_moe` +// +// The functionality is identical to triton kernel, excepts: +// * fuse silu_and_mul with gemm1, therefore this kernel +// allocates 2 intermediate_caches instead of 3 +// * add `offsets` in `moe_align_block_size` which keeps track +// of starting offset for each M block. this is for keeping +// output of silu_and_mul in sorted order, thus load_A for +// the 2nd gemm would be contiguous, therefore we can directly +// load A from intermediate_cache1. +// +// TODO: +// 1. tune BLOCK_M and BLOCK_N (BLOCK_N * K fit L2) +// 2. add prefetch for load A which is indexed access +// 3. 
abstract at::native::cpublas::brgemm with WoQ gemm (M = 1 & M != 1) +// + +template +inline void fill_stub(scalar_t* __restrict__ out, scalar_t val, int64_t size) { + using Vec = at::vec::Vectorized; + const Vec data_vec(val); + at::vec::map([data_vec](Vec out) { return out = data_vec; }, out, out, size); +} + +template +inline void copy_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t size) { + using Vec = at::vec::Vectorized; + // no remainder + #pragma GCC unroll 4 + for (int64_t d = 0; d < size; d += Vec::size()) { + Vec data = Vec::loadu(input + d); + data.store(out + d); + } +} + +template +inline void copy_mul_stub(scalar_t* __restrict__ out, const float* __restrict__ input, float weight, int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + const fVec weight_vec = fVec(weight); + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + fVec data0 = fVec::loadu(input + d) * weight_vec; + fVec data1 = fVec::loadu(input + d + fVec::size()) * weight_vec; + bVec out_vec = convert_from_float_ext(data0, data1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d] * weight); + } +} + +// acc from [topk, K] to [K] +template +inline void sum_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t topk, int64_t K) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + if (topk == 1) { + // do copy for topk = 1 + copy_stub(out, input, K); + } else { + // do sum for topk != 1 + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= K - kVecSize; d += kVecSize) { + fVec sum_fvec0 = fVec(0.f); + fVec sum_fvec1 = fVec(0.f); + for (int t = 0; t < topk; ++t) { + bVec x_bvec = bVec::loadu(input + t * K + d); + fVec x_fvec0, x_fvec1; + std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec); + + sum_fvec0 += x_fvec0; + sum_fvec1 += x_fvec1; + } + bVec out_bvec = convert_from_float_ext(sum_fvec0, sum_fvec1); + out_bvec.store(out + d); + } + for (; d < K; ++d) { + float sum_val = 0.f; + for (int t = 0; t < topk; ++t) { + sum_val += static_cast(input[t * K + d]); + } + out[d] = static_cast(sum_val); + } + } +} + +// out = input + input2 * scale +template +inline void add_mul_stub(scalar_t* __restrict__ out, const float* __restrict__ input, + const scalar_t* __restrict__ input2, float scale, int64_t size) { + + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + const fVec s_vec = fVec(scale); + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + fVec x0 = fVec::loadu(input + d); + fVec x1 = fVec::loadu(input + d + fVec::size()); + + bVec y_bvec = bVec::loadu(input2 + d); + fVec y0, y1; + std::tie(y0, y1) = at::vec::convert_to_float(y_bvec); + + x0 = x0 + y0 * s_vec; + x1 = x1 + y1 * s_vec; + bVec out_vec = convert_from_float_ext(x0, x1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d] + float(input2[d]) * scale); + } +} + +template +int moe_align_block_size( + int32_t* __restrict__ sorted_ids, + int32_t* __restrict__ expert_ids, + int32_t* __restrict__ topk_ids, + int32_t* __restrict__ total_cnts, + int32_t* __restrict__ cumsums, + int32_t* __restrict__ offsets, + int num_experts, + int numel, + int num_threads) { + + #define T_INDEX(tt) total_cnts + (tt) * num_experts + + // accumulate count of expert ids locally 
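+  // Illustrative walk-through (hypothetical sizes): with num_experts = 2,
+  // BLOCK_M = 32 and topk_ids = [1, 0, 1, 1], expert 0 owns 1 token and
+  // expert 1 owns 3; each expert is padded to a full 32-token block, giving
+  // cumsums = [0, 32, 64], num_tokens_post_pad = 64 and expert_ids = [0, 1].
+  // sorted_ids then holds token index 1 in expert 0's block and 0, 2, 3 in
+  // expert 1's block, with unused slots left at the fill value `numel`
+  // (assuming the caller pre-initializes sorted_ids that way).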
+ at::parallel_for(0, numel, 0, [&](int begin, int end) { + int tid = at::get_thread_num(); + int32_t* __restrict__ local_cnts = T_INDEX(tid + 1); + + for (int i = begin; i < end; ++i) { + local_cnts[topk_ids[i]]++; + } + }); + + using iVec = at::vec::Vectorized; + for (int t = 0; t < num_threads; ++t) { + at::vec::map2( + [](iVec x, iVec y) { return x + y; }, + T_INDEX(t + 1), T_INDEX(t + 1), T_INDEX(t), num_experts); + } + + // the last row holds sums of each experts + int32_t* total_cnts_t_1 = T_INDEX(num_threads); + + cumsums[0] = 0; + for (int e = 0; e < num_experts; ++e) { + // accumulate `num_tokens_post_pad`, also as the expert offset + cumsums[e + 1] = cumsums[e] + div_up(total_cnts_t_1[e], BLOCK_M) * BLOCK_M; + + for (int k = cumsums[e]; k < cumsums[e + 1]; k += BLOCK_M) { + expert_ids[k / BLOCK_M] = e; + } + } + int num_tokens_post_pad = cumsums[num_experts]; + + at::parallel_for(0, numel, 0, [&](int begin, int end) { + int tid = at::get_thread_num(); + // thread tid offsets in `total_cnts` + int32_t* __restrict__ offsets = T_INDEX(tid); + + for (int i = begin; i < end; ++i) { + int32_t expert_id = topk_ids[i]; + int32_t b_offset = cumsums[expert_id]; + int32_t t_offset = offsets[expert_id]; + sorted_ids[b_offset + t_offset] = i; + offsets[expert_id]++; + } + }); + + // debug: the offset for thread t_1 should be identical to t_2 + int32_t* total_cnts_t_2 = T_INDEX(num_threads - 1); + for (int e = 0; e < num_experts; ++e) { + TORCH_CHECK(total_cnts_t_1[e] == total_cnts_t_2[e]); + } + + // padding value for sorted_ids: numel + auto sorted_id_size = [=](const int32_t* sorted_ids_ptr) { + for (int d = 0; d < BLOCK_M; ++d) { + if (sorted_ids_ptr[d] == numel) { return d; } + } + return BLOCK_M; + }; + + // offsets holds starting offset for each valida M blocks + // shape : [num_token_blocks + 1] + offsets[0] = 0; + const int num_token_blocks = num_tokens_post_pad / BLOCK_M; + at::parallel_for(0, num_token_blocks, GRAIN_SIZE / BLOCK_M, [&](int begin, int end) { + for (int mb = begin; mb < end; ++mb) { + offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M); + } + }); + // TODO: do we need to vecterize this ? 
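+  // Illustrative: if the per-block valid-row counts gathered above are
+  // [32, 32, 7], this serial pass turns `offsets` into the prefix sum
+  // [0, 32, 64, 71], so block mb writes its rows starting at offsets[mb]
+  // and offsets[num_token_blocks] ends up equal to numel (checked below).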
+ for (int mb = 0; mb < num_token_blocks; ++mb) { + offsets[mb + 1] += offsets[mb]; + } + // debug: the last value of offsets should be `numel` + TORCH_CHECK(offsets[num_token_blocks] == numel); + + return num_tokens_post_pad; +} + +// silu : shape leading dimension +// input0 [m_size, BLOCK_N] BLOCK_N +// input1 [m_size, BLOCK_N] BLOCK_N +// output [M * topk, N] N +template +inline void silu_and_mul( + scalar_t* __restrict__ output, + const float* __restrict__ input0, // x: x0, x1 + const float* __restrict__ input1, // y: y0, y1 + int64_t m_size, + int64_t N) { + + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + + const fVec one = fVec(1.f); + + // no remainder + for (int64_t m = 0; m < m_size; ++m) { + scalar_t* __restrict__ out = output + m * N; + const float* __restrict__ x = input0 + m * BLOCK_N; + const float* __restrict__ y = input1 + m * BLOCK_N; + + for (int64_t d = 0; d < BLOCK_N; d += bVec::size()) { + fVec x0 = fVec::loadu(x + d); + fVec x1 = fVec::loadu(x + d + fVec::size()); + fVec y0 = fVec::loadu(y + d); + fVec y1 = fVec::loadu(y + d + fVec::size()); + // silu + x0 = x0 / (one + x0.neg().exp_u20()); + x1 = x1 / (one + x1.neg().exp_u20()); + // mul + x0 = x0 * y0; + x1 = x1 * y1; + // convert + bVec out_vec = convert_from_float_ext(x0, x1); + out_vec.store(out + d); + } + } +} + +template +struct tinygemm_kernel_nn2 { + static inline void apply( + const scalar_t* __restrict__ A, const scalar_t* __restrict__ B0, const scalar_t* __restrict__ B1, + scalar_t* __restrict__ C, int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!"); + } +}; + +#if defined(CPU_CAPABILITY_AVX512) +template +struct tinygemm_kernel_nn2 { + static inline void apply( + const at::BFloat16* __restrict__ A, const at::BFloat16* __restrict__ B0, const at::BFloat16* __restrict__ B1, + at::BFloat16* __restrict__ C, int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 16; + + static_assert(COLS % 2 == 0); + + // prefetch distance + constexpr int PREFETCH_SIZE_K = 0; + + __m512bh va; + __m512bh vb0[COLS]; + __m512bh vb1[COLS]; + __m512 vc0[ROWS * COLS]; + __m512 vc1[ROWS * COLS]; + + auto loadc = [&](auto i) { + vc0[i] = _mm512_set1_ps(0.f); + vc1[i] = _mm512_set1_ps(0.f); + }; + Unroll{}(loadc); + + const int64_t K2 = K >> 1; + const int64_t lda2 = lda >> 1; + const int64_t ldb2 = ldb; // ldb * 2 >> 1; + const float* a_ptr = reinterpret_cast(A); + const float* b0_ptr = reinterpret_cast(B0); + const float* b1_ptr = reinterpret_cast(B1); + + auto compute = [&](auto i, int64_t k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + va = (__m512bh)(_mm512_set1_ps(a_ptr[row * lda2 + k])); + } + if constexpr (row == 0) { + vb0[col] = (__m512bh)(_mm512_loadu_si512(b0_ptr + k * ldb2 + col * 16)); + vb1[col] = (__m512bh)(_mm512_loadu_si512(b1_ptr + k * ldb2 + col * 16)); + if constexpr (PREFETCH_SIZE_K > 0) { + _mm_prefetch(b0_ptr + (k + PREFETCH_SIZE_K) * ldb2 + col * 16, _MM_HINT_T0); + _mm_prefetch(b1_ptr + (k + PREFETCH_SIZE_K) * ldb2 + col * 16, _MM_HINT_T0); + } + } + vc0[i] = _mm512_dpbf16_ps(vc0[i], va, vb0[col]); + vc1[i] = _mm512_dpbf16_ps(vc1[i], va, vb1[col]); + }; + for (int64_t k = 0; k < K2; ++k) { + Unroll{}(compute, k); + } + + using Vec = at::vec::Vectorized; + const Vec one = Vec(1.f); + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + // for COLS = 2, 4 
use 512bit store + if constexpr (col % 2 == 0) { + Vec x0 = vc0[row * COLS + col + 0]; + Vec x1 = vc0[row * COLS + col + 1]; + Vec y0 = vc1[row * COLS + col + 0]; + Vec y1 = vc1[row * COLS + col + 1]; + // silu + x0 = x0 / (one + x0.neg().exp_u20()); + x1 = x1 / (one + x1.neg().exp_u20()); + // mul + x0 = x0 * y0; + x1 = x1 * y1; + + _mm512_storeu_si512( + reinterpret_cast<__m512i*>((C + row * ldc + col * 16)), + (__m512i)(_mm512_cvtne2ps_pbh(__m512(x1), __m512(x0)))); + } + }; + Unroll{}(storec); + } +}; +#endif + +#define LAUNCH_TINYGEMM_KERNEL_NN(MB_SIZE, NB_SIZE) \ + tinygemm_kernel_nn2::apply( \ + A + mb_start * lda, B0 + nb_start * 2, B1 + nb_start * 2, \ + C + mb_start * ldc + nb_start, K, lda, ldb, ldc); + +template +void tinygemm_kernel( + const scalar_t* __restrict__ A, + const scalar_t* __restrict__ B0, + const scalar_t* __restrict__ B1, + scalar_t* __restrict__ C, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc) { + + // pattern: 1-(2+2)-(8+8) + constexpr int64_t BLOCK_M = 4; + constexpr int64_t BLOCK_N = 32; + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + for (int mb = 0; mb < MB; ++mb) { + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(BLOCK_M, M - mb_start); + for (int64_t nb = 0; nb < NB; ++nb) { + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(BLOCK_N, N - nb_start); + + switch(mb_size << 4 | nb_size >> 4) { + // mb_size = 1 + case 0x12: LAUNCH_TINYGEMM_KERNEL_NN(1, 32); break; + // mb_size = 2 + case 0x22: LAUNCH_TINYGEMM_KERNEL_NN(2, 32); break; + // mb_size = 3 + case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break; + // mb_size = 4 + case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break; + default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size"); + } + } + } +} + +template +struct tinygemm_kernel_nn { + static inline void apply( + const scalar_t* __restrict__ A, const scalar_t* __restrict__ B, float* __restrict__ C, + int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!"); + } +}; + +#if defined(CPU_CAPABILITY_AVX512) +template +struct tinygemm_kernel_nn { + static inline void apply( + const at::BFloat16* __restrict__ A, const at::BFloat16* __restrict__ B, float* __restrict__ C, + int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 16; + + static_assert(COLS % 2 == 0); + + // prefetch distance + constexpr int PREFETCH_SIZE_K = 0; + + __m512bh va; + __m512bh vb[COLS]; + __m512 vc[ROWS * COLS]; + + auto loadc = [&](auto i) { + vc[i] = _mm512_set1_ps(0.f); + }; + Unroll{}(loadc); + + const int64_t K2 = K >> 1; + const int64_t lda2 = lda >> 1; + const int64_t ldb2 = ldb; // ldb * 2 >> 1; + const float* a_ptr = reinterpret_cast(A); + const float* b_ptr = reinterpret_cast(B); + + auto compute = [&](auto i, int64_t k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + va = (__m512bh)(_mm512_set1_ps(a_ptr[row * lda2 + k])); + } + if constexpr (row == 0) { + vb[col] = (__m512bh)(_mm512_loadu_si512(b_ptr + k * ldb2 + col * 16)); + if constexpr (PREFETCH_SIZE_K > 0) { + _mm_prefetch(b_ptr + (k + PREFETCH_SIZE_K) * ldb2 + col * 16, _MM_HINT_T0); + } + } + vc[i] = _mm512_dpbf16_ps(vc[i], va, vb[col]); + }; + for (int64_t k = 0; k < K2; ++k) { + Unroll{}(compute, k); + } + + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + 
_mm512_storeu_ps(reinterpret_cast<__m512*>(C + row * ldc + col * 16), vc[i]); + + }; + Unroll{}(storec); + } +}; +#endif + +#define LAUNCH_TINYGEMM_KERNEL_NN2(MB_SIZE, NB_SIZE) \ + tinygemm_kernel_nn::apply( \ + A + mb_start * lda, B + nb_start * 2, C + mb_start * ldc + nb_start, \ + K, lda, ldb, ldc); + +template +void tinygemm_kernel( + const scalar_t* __restrict__ A, + const scalar_t* __restrict__ B, + float* __restrict__ C, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc) { + + // pattern: 1-2-8 + constexpr int64_t BLOCK_M = 4; + constexpr int64_t BLOCK_N = 32; + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + for (int mb = 0; mb < MB; ++mb) { + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(BLOCK_M, M - mb_start); + for (int64_t nb = 0; nb < NB; ++nb) { + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(BLOCK_N, N - nb_start); + + switch(mb_size << 4 | nb_size >> 4) { + // mb_size = 1 + case 0x12: LAUNCH_TINYGEMM_KERNEL_NN2(1, 32); break; + // mb_size = 2 + case 0x22: LAUNCH_TINYGEMM_KERNEL_NN2(2, 32); break; + // mb_size = 3 + case 0x32: LAUNCH_TINYGEMM_KERNEL_NN2(3, 32); break; + // mb_size = 4 + case 0x42: LAUNCH_TINYGEMM_KERNEL_NN2(4, 32); break; + default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size"); + } + } + } +} + +template +void fused_experts_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic1, + scalar_t* __restrict__ ic2, + scalar_t* __restrict__ A_tmp, + float* __restrict__ C_tmp, + const scalar_t* __restrict__ input, + const scalar_t* __restrict__ packed_w1, + const scalar_t* __restrict__ packed_w2, + const float* __restrict__ topk_weights, + const int32_t* __restrict__ sorted_ids, + const int32_t* __restrict__ expert_ids, + const int32_t* __restrict__ offsets, + int64_t M, + int64_t N, + int64_t K, + int64_t E, + int64_t topk, + int64_t num_tokens_post_pad) { + + // handle 2 tiles per block + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + + // stage 1: intermediate_cache1 = silu(hidden_states @ w1) + const int64_t MB = div_up(num_tokens_post_pad, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + + // strides for w1: [E, 2N, K] + TORCH_CHECK(N % BLOCK_N == 0, "Fixme when N is not multiples of ", BLOCK_N); + + const int64_t stride_e = 2 * N * K; + const int64_t stride_n = K; + + // here we only parallel on half of 2N to fuse silu_and_mul with gemm + at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { + // get local pointers + int tid = at::get_thread_num(); + scalar_t* __restrict__ A = A_tmp + tid * BLOCK_M * K; + float* __restrict__ C0 = C_tmp + tid * 2 * BLOCK_M * BLOCK_N; + float* __restrict__ C1 = C0 + BLOCK_M * BLOCK_N; + + bool is_brgemm_used = false; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB; + int64_t nb = i % NB; + + // nb0 from top half and nb1 from bottom half + int64_t nb0 = nb, nb1 = nb + NB; + int64_t n_size = std::min(N - nb0 * BLOCK_N, BLOCK_N); + + // B shape [K, n_size] in vnni format + int32_t expert_id = expert_ids[mb]; + const scalar_t* __restrict__ B0 = packed_w1 + expert_id * stride_e + nb0 * BLOCK_N * stride_n; + const scalar_t* __restrict__ B1 = packed_w1 + expert_id * stride_e + nb1 * BLOCK_N * stride_n; + + // 1.a load A + const int32_t* A_ids = sorted_ids + mb * BLOCK_M; + int64_t m_size = offsets[mb + 1] - offsets[mb]; + + const bool use_brgemm = can_use_brgemm(m_size); + is_brgemm_used = is_brgemm_used || 
use_brgemm; + + for (int64_t m = 0; m < m_size; ++m) { + int32_t index = A_ids[m] / topk; + copy_stub(A + m * K, input + index * K, K); + } + + if (use_brgemm) { + // 1.b gemm: C0 = A @ B0 + at::native::cpublas::brgemm( + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ BLOCK_N, + /* add_C */ false, + /* A */ A, + /* B */ B0, + /* C */ C0); + + // 1.c gemm: C1 = A @ B1 + at::native::cpublas::brgemm( + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ BLOCK_N, + /* add_C */ false, + /* A */ A, + /* B */ B1, + /* C */ C1); + + // 1.d silu and mul + const int64_t offset = offsets[mb]; + silu_and_mul( + ic1 + offset * N + nb * BLOCK_N, + C0, + C1, + m_size, + N); + } else { + // fused 1.bcd: silu_and_mul(A @ B0, A @ B1) + const int64_t offset = offsets[mb]; + tinygemm_kernel( + /* A */ A, + /* B0 */ B0, + /* B1 */ B1, + /* C */ ic1 + offset * N + nb * BLOCK_N, + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ N); + } + } + + if (is_brgemm_used) { + at::native::cpublas::brgemm_release(); + } + }); + + // stage 2: intermediate_cache2 = intermediate_cache1 @ w2 + // w2 : [E, K, N] as [E, OC, IC] + const int64_t OC = K; // rename K as OC + const int64_t IC = N; // rename N as IC + const int64_t MB2 = MB; + const int64_t NB2 = div_up(OC, BLOCK_N); + const int64_t stride_e2 = OC * IC; + const int64_t stride_oc = IC; + + // parallel on [MB2, NB2] + at::parallel_for(0, MB2 * NB2, 0, [&](int64_t begin, int64_t end) { + // get local pointers + int tid = at::get_thread_num(); + // we won't be using C1 for gemm2 + float* __restrict__ C = C_tmp + tid * 2 * BLOCK_M * BLOCK_N; + + bool is_brgemm_used = false; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB2; + int64_t nb = i % NB2; + + int64_t m_size = offsets[mb + 1] - offsets[mb]; + int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N); + + const bool use_brgemm = can_use_brgemm(m_size); + is_brgemm_used = is_brgemm_used || use_brgemm; + + // A ptr from ic1 of [M * topk, N] in sorted order + // so as to avoid copy A to tmp buffer again + const scalar_t* __restrict__ A = ic1 + offsets[mb] * N; + const int32_t* A_ids = sorted_ids + mb * BLOCK_M; + + // B shape [IC, n_size] in vnni format + int32_t expert_id = expert_ids[mb]; + const scalar_t* __restrict__ B = packed_w2 + expert_id * stride_e2 + nb * BLOCK_N * stride_oc; + + // 2.a gemm: C = A @ B + if (use_brgemm) { + at::native::cpublas::brgemm( + /* M */ m_size, + /* N */ n_size, + /* K */ IC, + /* lda */ IC, + /* ldb */ n_size, + /* ldc */ BLOCK_N, + /* add_C */ false, + /* A */ A, + /* B */ B, + /* C */ C); + } else { + tinygemm_kernel( + /* A */ A, + /* B */ B, + /* C */ C, + /* M */ m_size, + /* N */ n_size, + /* K */ IC, + /* lda */ IC, + /* ldb */ n_size, + /* ldc */ BLOCK_N); + } + + // 2.b copy from C to ic2 in original order + // and also mul topk_weights in float32 + for (int64_t m = 0; m < m_size; ++m) { + int32_t index = A_ids[m]; + float weight = topk_weights[index]; + copy_mul_stub(ic2 + index * K + nb * BLOCK_N, C + m * BLOCK_N, weight, n_size); + } + } + + if (is_brgemm_used) { + at::native::cpublas::brgemm_release(); + } + }); + + // stage 3: out = intermediate_cache2.sum(dim=1) + // from [M, topk, K] to [M, K] + at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + sum_stub(output + m * K, ic2 + m * topk * K, topk, K); + } + }); +} + +template +void shared_expert_kernel_impl( + scalar_t* 
__restrict__ output, + scalar_t* __restrict__ ic1, + float* __restrict__ C_tmp, + scalar_t* __restrict__ input, + const scalar_t* __restrict__ packed_w1, + const scalar_t* __restrict__ packed_w2, + const scalar_t* __restrict__ fused_experts_out, + float routed_scaling_factor, + int64_t M, + int64_t N, + int64_t K) { + + // handle 2 tiles per block + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + + // stage 1: intermediate_cache1 = silu(hidden_states @ w1) + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + + TORCH_CHECK(N % BLOCK_N == 0, "Fixme when N is not multiples of ", BLOCK_N); + const int64_t stride_n = K; + + // here we only parallel on half of 2N to fuse silu_and_mul with gemm + at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { + // get local pointers + int tid = at::get_thread_num(); + float* __restrict__ C0 = C_tmp + tid * 2 * BLOCK_M * BLOCK_N; + float* __restrict__ C1 = C0 + BLOCK_M * BLOCK_N; + + bool is_brgemm_used = false; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB; + int64_t nb = i % NB; + + // nb0 from top half and nb1 from bottom half + int64_t nb0 = nb, nb1 = nb + NB; + int64_t n_size = std::min(N - nb0 * BLOCK_N, BLOCK_N); + int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M); + + //int64_t mb_start = mb * BLOCK_M; + //int64_t mb_size = std::min(M - mb_start, BLOCK_M); + + // A shape [m_size, K] + const scalar_t* A = input + mb * BLOCK_M * K; + + // B shape [K, n_size] in vnni format + const scalar_t* __restrict__ B0 = packed_w1 + nb0 * BLOCK_N * stride_n; + const scalar_t* __restrict__ B1 = packed_w1 + nb1 * BLOCK_N * stride_n; + + const bool use_brgemm = can_use_brgemm(m_size); + is_brgemm_used = is_brgemm_used || use_brgemm; + + if (use_brgemm) { + // 1.b gemm: C0 = A @ B0 + at::native::cpublas::brgemm( + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ BLOCK_N, + /* add_C */ false, + /* A */ A, + /* B */ B0, + /* C */ C0); + + // 1.c gemm: C1 = A @ B1 + at::native::cpublas::brgemm( + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ BLOCK_N, + /* add_C */ false, + /* A */ A, + /* B */ B1, + /* C */ C1); + + // 1.d silu and mul + silu_and_mul( + ic1 + mb * BLOCK_M * N + nb * BLOCK_N, + C0, + C1, + m_size, + N); + } else { + // fused 1.bcd: silu_and_mul(A @ B0, A @ B1) + tinygemm_kernel( + /* A */ A, + /* B0 */ B0, + /* B1 */ B1, + /* C */ ic1 + mb * BLOCK_M * N + nb * BLOCK_N, + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ N); + } + } + + if (is_brgemm_used) { + at::native::cpublas::brgemm_release(); + } + }); + + // stage 2: output = intermediate_cache1 @ w2 + // w2 : [K, N] as [OC, IC] + const int64_t OC = K; // rename K as OC + const int64_t IC = N; // rename N as IC + const int64_t MB2 = MB; + const int64_t NB2 = div_up(OC, BLOCK_N); + const int64_t stride_oc = IC; + + // parallel on [MB2, NB2] + at::parallel_for(0, MB2 * NB2, 0, [&](int64_t begin, int64_t end) { + // get local pointers + int tid = at::get_thread_num(); + // we won't be using C1 for gemm2 + float* __restrict__ C = C_tmp + tid * 2 * BLOCK_M * BLOCK_N; + + bool is_brgemm_used = false; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB2; + int64_t nb = i % NB2; + + int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M); + int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N); + + const bool use_brgemm = can_use_brgemm(m_size); + 
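+      // can_use_brgemm(m_size) decides whether this row block goes through
+      // at::native::cpublas::brgemm or the hand-written AVX512 tinygemm
+      // fallback; any thread that took the brgemm path calls brgemm_release()
+      // once at the end of the parallel region.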
is_brgemm_used = is_brgemm_used || use_brgemm; + + // A shape [m_size, IC] + const scalar_t* __restrict__ A = ic1 + mb * BLOCK_M * N; + + // B shape [IC, n_size] in vnni format + const scalar_t* __restrict__ B = packed_w2 + nb * BLOCK_N * stride_oc; + + // 2.a gemm: C = A @ B + if (use_brgemm) { + at::native::cpublas::brgemm( + /* M */ m_size, + /* N */ n_size, + /* K */ IC, + /* lda */ IC, + /* ldb */ n_size, + /* ldc */ BLOCK_N, + /* add_C */ false, + /* A */ A, + /* B */ B, + /* C */ C); + } else { + tinygemm_kernel( + /* A */ A, + /* B */ B, + /* C */ C, + /* M */ m_size, + /* N */ n_size, + /* K */ IC, + /* lda */ IC, + /* ldb */ n_size, + /* ldc */ BLOCK_N); + } + + // 2.b copy from C to output and add fused_experts_out + scalar_t* __restrict__ out = output + mb * BLOCK_M * K + nb * BLOCK_N; + const scalar_t* __restrict__ fused_out = fused_experts_out + mb * BLOCK_M * K + nb * BLOCK_N; + for (int64_t m = 0; m < m_size; ++m) { + add_mul_stub(out + m * K, C + m * BLOCK_N, fused_out + m * K, routed_scaling_factor, n_size); + } + } + + if (is_brgemm_used) { + at::native::cpublas::brgemm_release(); + } + }); +} + +} // anonymous namespace + +// common checks +static inline void check_moe_scales( + bool use_int8_w8a8, + bool use_fp8_w8a16, + const std::optional& w1_scale, + const std::optional& w2_scale, + const std::optional> block_size, + const std::optional& a1_scale, + const std::optional& a2_scale) { + if (use_int8_w8a8) { + TORCH_CHECK(w1_scale.has_value(), "missing w1_scale for int8 w8a8."); + TORCH_CHECK(w2_scale.has_value(), "missing w2_scale for int8 w8a8."); + TORCH_CHECK(!a1_scale.has_value(), "static quantization for activation not supported."); + TORCH_CHECK(!a2_scale.has_value(), "static quantization for activation not supported."); + } + if (use_fp8_w8a16) { + TORCH_CHECK(w1_scale.has_value(), "missing w1_scale for fp8 w8a16."); + TORCH_CHECK(w2_scale.has_value(), "missing w2_scale for fp8 w8a16."); + TORCH_CHECK(block_size.has_value(), "missing block_size for fp8 w8a16."); + TORCH_CHECK(block_size.value().size() == 2, "expect block_size.size() to be 2."); + } +} + +#define CHECK_MOE_SCALES_FP8(DIM0, DIM1) \ + auto w1s = w1_scale.value(); \ + auto w2s = w2_scale.value(); \ + auto block_size_val = block_size.value(); \ + int64_t block_size_N = block_size_val[0]; \ + int64_t block_size_K = block_size_val[1]; \ + TORCH_CHECK(w1s.size(DIM0) == 2 * N / block_size_N); \ + TORCH_CHECK(w1s.size(DIM1) == K / block_size_K); \ + TORCH_CHECK(w2s.size(DIM0) == K / block_size_N); \ + TORCH_CHECK(w2s.size(DIM1) == N / block_size_K) + +// hidden_states: [M, K] +// w1: [E, 2N, K] +// w2: [E, K, N] +// topk_weights: [M, topk] +// topk_ids: [M, topk] (int32_t) +// +at::Tensor fused_experts_cpu( + at::Tensor& hidden_states, + at::Tensor& w1, + at::Tensor& w2, + at::Tensor& topk_weights, + at::Tensor& topk_ids, + bool inplace, + bool use_int8_w8a8, + bool use_fp8_w8a16, + const std::optional& w1_scale, + const std::optional& w2_scale, + const std::optional> block_size, + const std::optional& a1_scale, + const std::optional& a2_scale, + bool is_vnni) { + RECORD_FUNCTION("sgl-kernel::fused_experts_cpu", std::vector({hidden_states, w1, w2, topk_weights, topk_ids})); + + auto packed_w1 = is_vnni ? w1 : convert_weight_packed(w1); + auto packed_w2 = is_vnni ? 
w2 : convert_weight_packed(w2); + + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + + const auto st = hidden_states.scalar_type(); + CHECK_INPUT(hidden_states); + CHECK_INPUT(w1); + CHECK_INPUT(w2); + CHECK_EQ(topk_weights.sizes(), topk_ids.sizes()); + CHECK_DIM(2, hidden_states); + CHECK_DIM(3, w1); + CHECK_DIM(3, w2); + CHECK_DIM(2, topk_weights); + CHECK_DIM(2, topk_ids); + + CHECK_EQ(topk_ids.scalar_type(), at::kInt); + CHECK_EQ(topk_weights.scalar_type(), at::kFloat); + + int64_t M = hidden_states.size(0); + int64_t K = hidden_states.size(1); + int64_t N = w1.size(1) / 2; + int64_t E = w1.size(0); + int64_t topk = topk_weights.size(1); + + // we use int32_t compensation for int8 w8a8 + int64_t packed_K = get_row_size(K, use_int8_w8a8); + int64_t packed_N = get_row_size(N, use_int8_w8a8); + + // check weight shapes + CHECK_EQ(w2.size(0), E); + CHECK_EQ(w2.size(1), K); + CHECK_EQ(packed_w1.size(2), packed_K); + CHECK_EQ(packed_w2.size(2), packed_N); + + // check scales + check_moe_scales(use_int8_w8a8, use_fp8_w8a16, w1_scale, w2_scale, block_size, a1_scale, a2_scale); + + at::Tensor out_hidden_states = inplace ? hidden_states : at::empty_like(hidden_states); + + // NB: worst case is each expert holds a block with remainder of 1 + // 1. sorted_ids : [M * topk + E * (BLOCK_M - 1)] + // 2. expert_ids : [max_num_blocks] + // 3. total_cnts : [T + 1, E] + // 4. cumsums : [E + 1] + // 5. offsets : [max_num_blocks + 1] + // + int num_threads = at::get_num_threads(); + int64_t max_num_tokens_padded = M * topk + E * (BLOCK_M - 1); + int64_t max_num_blocks = div_up(max_num_tokens_padded, BLOCK_M); + auto buffer = at::empty( + {max_num_tokens_padded + max_num_blocks + (num_threads + 1) * E + (E + 1) + (max_num_blocks + 1)}, + topk_ids.options()); + + int32_t* __restrict__ sorted_ids = buffer.data_ptr(); + int32_t* __restrict__ expert_ids = sorted_ids + max_num_tokens_padded; + int32_t* __restrict__ total_cnts = expert_ids + max_num_blocks; + int32_t* __restrict__ cumsums = total_cnts + (num_threads + 1) * E; + int32_t* __restrict__ offsets = cumsums + (E + 1); + + // init sorted_ids with `numel` as the padding number + // init expert_ids with `num_experts` + int64_t numel = M * topk; + at::parallel_for(0, max_num_blocks, GRAIN_SIZE / BLOCK_M, [&](int64_t begin, int64_t end) { + int64_t m_start = begin * BLOCK_M; + int64_t m_size = std::min((end - begin) * BLOCK_M, max_num_tokens_padded - m_start); + fill_stub(sorted_ids + m_start, (int32_t)numel, m_size); + fill_stub(expert_ids + begin, (int32_t)E, end - begin); + }); + // zero total_cnts and cumsums + at::parallel_for(0, (num_threads + 1) * E + (E + 1), GRAIN_SIZE, [&](int64_t begin, int64_t end) { + fill_stub(total_cnts + begin, 0, end - begin); + }); + + // align experts index + int64_t num_tokens_post_pad = moe_align_block_size( + sorted_ids, expert_ids, topk_ids.data_ptr(), total_cnts, cumsums, offsets, E, numel, num_threads); + + // unlike triton kernel, we fuse silu with gemm1 so only need 2 intermediate_caches: + // 1. intermediate_cache1 : [M * topk, N] + // 2. intermediate_cache2 : [M * topk, K] + // 3. A_tmp : [T, BLOCK_M * K] + // 4. C_tmp : [T, 2 * BLOCK_M * BLOCK_N] + // + // for int8 w8a8: + // 5. Aq_tmp : [M, K] or [M * topk, N] + // 6. As_tmp : [M * topk] + // + // for fp8 w8a16: + // 7. intermediate_cache0 : [M * topk, 2N] + // 8. B_tmp : [T, BLOCK_N, std::max(K, N)] + // + int64_t buffer_size_nbytes = M * topk * N * 2 + M * topk * K * 2 + + num_threads * BLOCK_M * K * (use_int8_w8a8 ? 
1 : 2) + + num_threads * 2 * BLOCK_M * BLOCK_N * sizeof(float); + + if (use_int8_w8a8) { + buffer_size_nbytes += std::max(M * K, M * topk * N) + M * topk * sizeof(float); + } + if (use_fp8_w8a16) { + buffer_size_nbytes += M * topk * 2 * N * 2 + num_threads * BLOCK_N * std::max(K, N) * 2; + } + + auto buffer2 = at::empty({buffer_size_nbytes}, hidden_states.options().dtype(at::kChar)); + + AT_DISPATCH_REDUCED_FLOATING_TYPES(st, "fused_experts_kernel_impl", [&] { + scalar_t* __restrict__ intermediate_cache1 = (scalar_t*)((void*)(buffer2.data_ptr())); + scalar_t* __restrict__ intermediate_cache2 = intermediate_cache1 + M * topk * N; + + if (use_int8_w8a8) { + uint8_t* __restrict__ A_tmp = (uint8_t*)((void*)(intermediate_cache2 + M * topk * K)); + float* __restrict__ C_tmp = (float*)((void*)(A_tmp + num_threads * BLOCK_M * K)); + uint8_t* __restrict__ Aq_tmp = (uint8_t*)((void*)(C_tmp + num_threads * 2 * BLOCK_M * BLOCK_N)); + float* __restrict__ As_tmp = (float*)((void*)(Aq_tmp + std::max(M * K, M * topk * N))); + + auto w1s = w1_scale.value(); + auto w2s = w2_scale.value(); + TORCH_CHECK(w1s.numel() == E * 2 * N); + TORCH_CHECK(w2s.numel() == E * K); + + fused_experts_int8_kernel_impl( + out_hidden_states.data_ptr(), + intermediate_cache1, + intermediate_cache2, + A_tmp, + C_tmp, + Aq_tmp, + As_tmp, + hidden_states.data_ptr(), + packed_w1.data_ptr(), + packed_w2.data_ptr(), + w1s.data_ptr(), + w2s.data_ptr(), + topk_weights.data_ptr(), + sorted_ids, + expert_ids, + offsets, + M, + N, + K, + E, + topk, + num_tokens_post_pad); + } else if (use_fp8_w8a16) { + // here we just ignore C_tmp as it is not used + scalar_t* __restrict__ A_tmp = (scalar_t*)((void*)(intermediate_cache2 + M * topk * K)); + float* __restrict__ C_tmp = (float*)((void*)(A_tmp + num_threads * BLOCK_M * K)); + scalar_t* __restrict__ intermediate_cache0 = (scalar_t*)((void*)(C_tmp + num_threads * 2 * BLOCK_M * BLOCK_N)); + scalar_t* __restrict__ B_tmp = (scalar_t*)((void*)(intermediate_cache0 + M * topk * 2 * N)); + + CHECK_MOE_SCALES_FP8(1, 2); + fused_experts_fp8_kernel_impl( + out_hidden_states.data_ptr(), + intermediate_cache0, + intermediate_cache1, + intermediate_cache2, + A_tmp, + B_tmp, + C_tmp, + hidden_states.data_ptr(), + packed_w1.data_ptr(), + packed_w2.data_ptr(), + w1s.data_ptr(), + w2s.data_ptr(), + block_size_N, + block_size_K, + topk_weights.data_ptr(), + sorted_ids, + expert_ids, + offsets, + M, + N, + K, + E, + topk, + num_tokens_post_pad); + } else { + scalar_t* __restrict__ A_tmp = intermediate_cache2 + M * topk * K; + float* __restrict__ C_tmp = (float*)((void*)(A_tmp + num_threads * BLOCK_M * K)); + + fused_experts_kernel_impl( + out_hidden_states.data_ptr(), + intermediate_cache1, + intermediate_cache2, + A_tmp, + C_tmp, + hidden_states.data_ptr(), + packed_w1.data_ptr(), + packed_w2.data_ptr(), + topk_weights.data_ptr(), + sorted_ids, + expert_ids, + offsets, + M, + N, + K, + E, + topk, + num_tokens_post_pad); + } + }); + return out_hidden_states; +} + +// shared expert kernel +// +// hidden_states: [M, K] +// w1: [2N, K] +// w2: [K, N] +// fused_experts_out +at::Tensor shared_expert_cpu( + at::Tensor& hidden_states, + at::Tensor& w1, + at::Tensor& w2, + at::Tensor& fused_experts_out, + double routed_scaling_factor, + bool inplace, + bool use_int8_w8a8, + bool use_fp8_w8a16, + std::optional& w1_scale, + std::optional& w2_scale, + std::optional> block_size, + std::optional& a1_scale, + std::optional& a2_scale, + bool is_vnni) { + RECORD_FUNCTION("sgl-kernel::shared_expert_cpu", 
std::vector({hidden_states, w1, w2})); + + auto packed_w1 = is_vnni ? w1 : convert_weight_packed(w1); + auto packed_w2 = is_vnni ? w2 : convert_weight_packed(w2); + + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + + const auto st = hidden_states.scalar_type(); + CHECK_INPUT(hidden_states); + CHECK_INPUT(fused_experts_out); + CHECK_INPUT(w1); + CHECK_INPUT(w2); + CHECK_DIM(2, hidden_states); + CHECK_DIM(2, w1); + CHECK_DIM(2, w2); + CHECK_EQ(hidden_states.sizes(), fused_experts_out.sizes()); + CHECK_EQ(hidden_states.scalar_type(), st); + + int64_t M = hidden_states.size(0); + int64_t K = hidden_states.size(1); + int64_t N = w1.size(0) / 2; + + // we use int32_t compensation for int8 w8a8 + int64_t packed_K = get_row_size(K, use_int8_w8a8); + int64_t packed_N = get_row_size(N, use_int8_w8a8); + + // check weight shapes + CHECK_EQ(w2.size(0), K); + CHECK_EQ(packed_w1.size(1), packed_K); + CHECK_EQ(packed_w2.size(1), packed_N); + + // check scales + check_moe_scales(use_int8_w8a8, use_fp8_w8a16, w1_scale, w2_scale, block_size, a1_scale, a2_scale); + + at::Tensor out_hidden_states = inplace ? hidden_states : at::empty_like(hidden_states); + + // unlike triton kernel, we fuse silu with gemm1 so only need 2 intermediate_caches: + // 1. intermediate_cache1 : [M, N] + // 2. C_tmp : [T, 2 * BLOCK_M * BLOCK_N] + // + // for int8 w8a8: + // 3. Aq_tmp : [M, K] or [M, N] + // 4. As_tmp : [M] + // + // for fp8 w8a16: + // 5. intermediate_cache0 : [M, 2N] + // 6. B_tmp: [T, BLOCK_M, max(K, N)] + // + int num_threads = at::get_num_threads(); + int64_t buffer_size_nbytes = M * N * 2 + num_threads * 2 * BLOCK_M * BLOCK_N * sizeof(float); + + if (use_int8_w8a8) { + buffer_size_nbytes += std::max(M * K, M * N) + M * sizeof(float); + } + if (use_fp8_w8a16) { + buffer_size_nbytes += M * 2 * N * 2 + num_threads * BLOCK_M * std::max(K, N) * 2; + } + + auto buffer = at::empty({buffer_size_nbytes}, hidden_states.options().dtype(at::kChar)); + AT_DISPATCH_REDUCED_FLOATING_TYPES(st, "share_experts_kernel_impl", [&] { + scalar_t* __restrict__ intermediate_cache1 = (scalar_t*)((void*)(buffer.data_ptr())); + float* __restrict__ C_tmp = (float*)((void*)(intermediate_cache1 + M * N)); + + if (use_int8_w8a8) { + uint8_t* __restrict__ Aq_tmp = (uint8_t*)((void*)(C_tmp + num_threads * 2 * BLOCK_M * BLOCK_N)); + float* __restrict__ As_tmp = (float*)((void*)(Aq_tmp + std::max(M * K, M * N))); + + auto w1s = w1_scale.value(); + auto w2s = w2_scale.value(); + TORCH_CHECK(w1s.numel() == 2 * N); + TORCH_CHECK(w2s.numel() == K); + + shared_expert_int8_kernel_impl( + out_hidden_states.data_ptr(), + intermediate_cache1, + C_tmp, + Aq_tmp, + As_tmp, + hidden_states.data_ptr(), + packed_w1.data_ptr(), + packed_w2.data_ptr(), + w1s.data_ptr(), + w2s.data_ptr(), + fused_experts_out.data_ptr(), + routed_scaling_factor, + M, + N, + K); + } else if (use_fp8_w8a16) { + scalar_t* __restrict__ intermediate_cache0 = (scalar_t*)((void*)(C_tmp + num_threads * 2 * BLOCK_M * BLOCK_N)); + scalar_t* __restrict__ B_tmp = (scalar_t*)((void*)(intermediate_cache0 + M * 2 * N)); + + CHECK_MOE_SCALES_FP8(0, 1); + shared_expert_fp8_kernel_impl( + out_hidden_states.data_ptr(), + intermediate_cache0, + intermediate_cache1, + B_tmp, + C_tmp, + hidden_states.data_ptr(), + packed_w1.data_ptr(), + packed_w2.data_ptr(), + w1s.data_ptr(), + w2s.data_ptr(), + block_size_N, + block_size_K, + fused_experts_out.data_ptr(), + routed_scaling_factor, + M, + N, + K); + } else { + shared_expert_kernel_impl( + 
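+          // native (non-quantized) path: activations and vnni-packed weights
+          // stay in bf16/fp16, so no scale tensors or quantization buffers
+          // are needed for this call.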
out_hidden_states.data_ptr(), + intermediate_cache1, + C_tmp, + hidden_states.data_ptr(), + packed_w1.data_ptr(), + packed_w2.data_ptr(), + fused_experts_out.data_ptr(), + routed_scaling_factor, + M, + N, + K); + } + }); + return out_hidden_states; +} diff --git a/csrc/cpu/sgl-kernels/moe_fp8.cpp b/csrc/cpu/sgl-kernels/moe_fp8.cpp new file mode 100644 index 0000000000000..84a6af267740a --- /dev/null +++ b/csrc/cpu/sgl-kernels/moe_fp8.cpp @@ -0,0 +1,502 @@ +// Adapted from +// https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc/cpu + +#include "common.h" +#include "gemm.h" +#include "vec.h" + +// clang-format off + +namespace { + +template +inline void copy_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t size) { + using Vec = at::vec::Vectorized; + // no remainder + #pragma GCC unroll 4 + for (int64_t d = 0; d < size; d += Vec::size()) { + Vec data = Vec::loadu(input + d); + data.store(out + d); + } +} + +template +inline void copy_mul_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, float weight, int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + const fVec weight_vec = fVec(weight); + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + bVec x = bVec::loadu(input + d); + fVec x0, x1; + std::tie(x0, x1) = at::vec::convert_to_float(x); + x0 = x0 * weight_vec; + x1 = x1 * weight_vec; + bVec out_vec = convert_from_float_ext(x0, x1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d] * weight); + } +} + +// acc from [topk, K] to [K] +template +inline void sum_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t topk, int64_t K) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + if (topk == 1) { + // do copy for topk = 1 + copy_stub(out, input, K); + } else { + // do sum for topk != 1 + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= K - kVecSize; d += kVecSize) { + fVec sum_fvec0 = fVec(0.f); + fVec sum_fvec1 = fVec(0.f); + for (int t = 0; t < topk; ++t) { + bVec x_bvec = bVec::loadu(input + t * K + d); + fVec x_fvec0, x_fvec1; + std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec); + + sum_fvec0 += x_fvec0; + sum_fvec1 += x_fvec1; + } + bVec out_bvec = convert_from_float_ext(sum_fvec0, sum_fvec1); + out_bvec.store(out + d); + } + for (; d < K; ++d) { + float sum_val = 0.f; + for (int t = 0; t < topk; ++t) { + sum_val += static_cast(input[t * K + d]); + } + out[d] = static_cast(sum_val); + } + } +} + +// out = input + input2 * scale +template +inline void add_mul_stub( + scalar_t* __restrict__ out, + const scalar_t* __restrict__ input, + const scalar_t* __restrict__ input2, + float scale, + int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + const fVec s_vec = fVec(scale); + + int64_t d; +#pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + bVec x_bvec = bVec::loadu(input + d); + fVec x0, x1; + std::tie(x0, x1) = at::vec::convert_to_float(x_bvec); + + bVec y_bvec = bVec::loadu(input2 + d); + fVec y0, y1; + std::tie(y0, y1) = at::vec::convert_to_float(y_bvec); + + x0 = x0 + y0 * s_vec; + x1 = x1 + y1 * s_vec; + bVec out_vec = convert_from_float_ext(x0, x1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d] + float(input2[d]) * scale); + } +} + 
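+// silu_and_mul_stub: out[d] = silu(input[d]) * input2[d]
+//                           = input[d] / (1 + exp(-input[d])) * input2[d]
+// `input` is the gate half and `input2` the up half of the [*, 2N]
+// intermediate produced by the w1 GEMM. Lanes are widened to fp32, the
+// activation and product are applied, and the result is narrowed back to
+// bf16/fp16 on store; `size` (= N) is assumed to be a multiple of the
+// vector width, hence the "no remainder" note below.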
+template +inline void silu_and_mul_stub( + scalar_t* __restrict__ out, + const scalar_t* __restrict__ input, + const scalar_t* __restrict__ input2, + int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + const fVec one = fVec(1.f); + + // no remainder +#pragma GCC unroll 4 + for (int64_t d = 0; d < size; d += bVec::size()) { + bVec x = bVec::loadu(input + d); + fVec x0, x1; + std::tie(x0, x1) = at::vec::convert_to_float(x); + bVec y = bVec::loadu(input2 + d); + fVec y0, y1; + std::tie(y0, y1) = at::vec::convert_to_float(y); + x0 = x0 / (one + x0.neg().exp_u20()); + x1 = x1 / (one + x1.neg().exp_u20()); + x0 = x0 * y0; + x1 = x1 * y1; + bVec out_vec = convert_from_float_ext(x0, x1); + out_vec.store(out + d); + } +} + +} // anonymous namespace + +template +void fused_experts_fp8_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic0, + scalar_t* __restrict__ ic1, + scalar_t* __restrict__ ic2, + scalar_t* __restrict__ A_tmp, + scalar_t* __restrict__ B_tmp, + float* __restrict__ C_tmp, + const scalar_t* __restrict__ input, + const at::Float8_e4m3fn* __restrict__ packed_w1, + const at::Float8_e4m3fn* __restrict__ packed_w2, + const float* __restrict__ w1s, + const float* __restrict__ w2s, + int64_t block_size_N, + int64_t block_size_K, + const float* __restrict__ topk_weights, + const int32_t* __restrict__ sorted_ids, + const int32_t* __restrict__ expert_ids, + const int32_t* __restrict__ offsets, + int64_t M, + int64_t N, + int64_t K, + int64_t E, + int64_t topk, + int64_t num_tokens_post_pad) { + + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + + // stage 1: intermediate_cache0 = hidden_states @ w1 + const int64_t MB = div_up(num_tokens_post_pad, BLOCK_M); + const int64_t NB = div_up(2 * N, BLOCK_N); + int64_t scale_size_N = div_up(2 * N, block_size_N); + int64_t scale_size_K = div_up(K, block_size_K); + int64_t blocks_n_per_group = block_size_N / BLOCK_N; + + const int64_t stride_e = 2 * N * K; + const int64_t stride_n = K; + + // here we only parallel on half of 2N to fuse silu_and_mul with gemm + at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { + // get local pointers + int tid = at::get_thread_num(); + scalar_t* __restrict__ A = A_tmp + tid * BLOCK_M * K; + + bool is_brgemm_used = false; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB; + int64_t nb = i % NB; + + int64_t n_size = std::min(2 * N - nb * BLOCK_N, BLOCK_N); + + // B shape [K, n_size] in vnni format + int32_t expert_id = expert_ids[mb]; + const at::Float8_e4m3fn* __restrict__ B = packed_w1 + expert_id * stride_e + nb * BLOCK_N * stride_n; + const float* __restrict__ Bs = w1s + expert_id * scale_size_N * scale_size_K + (nb / blocks_n_per_group) * scale_size_K; + + // 1.a load A + const int32_t* A_ids = sorted_ids + mb * BLOCK_M; + int64_t m_size = offsets[mb + 1] - offsets[mb]; + + const bool use_brgemm = can_use_brgemm(m_size); + is_brgemm_used = is_brgemm_used || use_brgemm; + + for (int64_t m = 0; m < m_size; ++m) { + int32_t index = A_ids[m] / topk; + copy_stub(A + m * K, input + index * K, K); + } + + const int64_t offset = offsets[mb]; + tinygemm_kernel( + /* A */ A, + /* B */ B, + /* C */ ic0 + offset * 2 * N + nb * BLOCK_N, + /* Btmp */ B_tmp + tid * BLOCK_N * std::max(K, N), + /* Ctmp */ C_tmp + tid * 2 * BLOCK_M * BLOCK_N, + /* scale */ Bs, + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ 2 * N, + /* brg */ use_brgemm, + /* block_size_K 
*/ block_size_K); + } + + if (is_brgemm_used) { + at::native::cpublas::brgemm_release(); + } + }); + + // stage 1.5: intermediate_cache1 = silu(intermediate_cache0) + at::parallel_for(0, M * topk, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + silu_and_mul_stub( + ic1 + m * N, + ic0 + m * 2 * N, + ic0 + m * 2 * N + N, + N); + } + }); + + // stage 2: intermediate_cache2 = intermediate_cache1 @ w2 + // w2 : [E, K, N] as [E, OC, IC] + const int64_t OC = K; // rename K as OC + const int64_t IC = N; // rename N as IC + const int64_t MB2 = MB; + const int64_t NB2 = div_up(OC, BLOCK_N); + scale_size_N = div_up(K, block_size_N); + scale_size_K = div_up(N, block_size_K); + const int64_t stride_e2 = OC * IC; + const int64_t stride_oc = IC; + + // parallel on [MB2, NB2] + at::parallel_for(0, MB2 * NB2, 0, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + alignas(64) scalar_t C[BLOCK_M * BLOCK_K]; + + bool is_brgemm_used = false; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB2; + int64_t nb = i % NB2; + + int64_t m_size = offsets[mb + 1] - offsets[mb]; + int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N); + + const bool use_brgemm = can_use_brgemm(m_size); + is_brgemm_used = is_brgemm_used || use_brgemm; + + // A ptr from ic1 of [M * topk, N] in sorted order + // so as to avoid copy A to tmp buffer again + const scalar_t* __restrict__ A = ic1 + offsets[mb] * N; + const int32_t* A_ids = sorted_ids + mb * BLOCK_M; + + // B shape [IC, n_size] in vnni format + int32_t expert_id = expert_ids[mb]; + const at::Float8_e4m3fn* __restrict__ B = packed_w2 + expert_id * stride_e2 + nb * BLOCK_N * stride_oc; + const float* __restrict__ Bs = w2s + expert_id * scale_size_N * scale_size_K + (nb / blocks_n_per_group) * scale_size_K; + + tinygemm_kernel( + /* A */ A, + /* B */ B, + /* C */ C, + /* Btmp */ B_tmp + tid * BLOCK_N * std::max(K, N), + /* Ctmp */ C_tmp + tid * 2 * BLOCK_M * BLOCK_N, + /* scale */ Bs, + /* M */ m_size, + /* N */ n_size, + /* K */ IC, + /* lda */ IC, + /* ldb */ n_size, + /* ldc */ BLOCK_N, + /* brg */ use_brgemm, + /* block_size_K */ block_size_K); + + // 2.b copy from C to ic2 in original order + // and also mul topk_weights in float32 + for (int64_t m = 0; m < m_size; ++m) { + int32_t index = A_ids[m]; + float weight = topk_weights[index]; + copy_mul_stub(ic2 + index * K + nb * BLOCK_N, C + m * BLOCK_N, weight, n_size); + } + } + + if (is_brgemm_used) { + at::native::cpublas::brgemm_release(); + } + }); + + // stage 3: out = intermediate_cache2.sum(dim=1) + // from [M, topk, K] to [M, K] + at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + sum_stub(output + m * K, ic2 + m * topk * K, topk, K); + } + }); +} + +#define INSTANTIATE_MOE_FP8_TEMPLATE(TYPE) \ + template void fused_experts_fp8_kernel_impl( \ + TYPE* __restrict__ output, \ + TYPE* __restrict__ ic0, \ + TYPE* __restrict__ ic1, \ + TYPE* __restrict__ ic2, \ + TYPE* __restrict__ A_tmp, \ + TYPE* __restrict__ B_tmp, \ + float* __restrict__ C_tmp, \ + const TYPE* __restrict__ input, \ + const at::Float8_e4m3fn* __restrict__ packed_w1, \ + const at::Float8_e4m3fn* __restrict__ packed_w2, \ + const float* __restrict__ w1s, \ + const float* __restrict__ w2s, \ + int64_t block_size_N, \ + int64_t block_size_K, \ + const float* __restrict__ topk_weights, \ + const int32_t* __restrict__ sorted_ids, \ + const int32_t* __restrict__ expert_ids, \ + const int32_t* __restrict__ offsets, \ + int64_t M, \ + int64_t N, \ + 
int64_t K, \ + int64_t E, \ + int64_t topk, \ + int64_t num_tokens_post_pad) + +INSTANTIATE_MOE_FP8_TEMPLATE(at::BFloat16); +INSTANTIATE_MOE_FP8_TEMPLATE(at::Half); + +template +void shared_expert_fp8_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic0, + scalar_t* __restrict__ ic1, + scalar_t* __restrict__ B_tmp, + float* __restrict__ C_tmp, + const scalar_t* __restrict__ input, + const at::Float8_e4m3fn* __restrict__ packed_w1, + const at::Float8_e4m3fn* __restrict__ packed_w2, + const float* __restrict__ w1s, + const float* __restrict__ w2s, + int64_t block_size_N, + int64_t block_size_K, + const scalar_t* __restrict__ fused_experts_out, + float routed_scaling_factor, + int64_t M, + int64_t N, + int64_t K) { + + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + + // stage 1: intermediate_cache0 = hidden_states @ w1 + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(2 * N, BLOCK_N); + int64_t scale_size_K = div_up(K, block_size_K); + int64_t blocks_n_per_group = block_size_N / BLOCK_N; + + const bool use_brgemm = can_use_brgemm(M); + + at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB; + int64_t nb = i % NB; + int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M); + int64_t n_size = std::min(2 * N - nb * BLOCK_N, BLOCK_N); + + tinygemm_kernel( + /* A */ input + mb * BLOCK_M * K, + /* B */ packed_w1 + nb * BLOCK_N * K, + /* C */ ic0 + mb * BLOCK_M * 2 * N + nb * BLOCK_N, + /* Btmp */ B_tmp + tid * BLOCK_N * std::max(K, N), + /* Ctmp */ C_tmp + tid * 2 * BLOCK_M * BLOCK_N, + /* scale */ w1s + (nb / blocks_n_per_group) * scale_size_K, + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ 2 * N, + /* brg */ use_brgemm, + /* block_size_K */ block_size_K); + } + + if (use_brgemm) { + at::native::cpublas::brgemm_release(); + } + }); + + // stage 1.5: intermediate_cache1 = silu(intermediate_cache0) + at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + silu_and_mul_stub( + ic1 + m * N, + ic0 + m * 2 * N, + ic0 + m * 2 * N + N, + N); + } + }); + + // stage 2: intermediate_cache2 = intermediate_cache1 @ w2 + // w2 : [K, N] as [OC, IC] + const int64_t OC = K; // rename K as OC + const int64_t IC = N; // rename N as IC + const int64_t MB2 = MB; + const int64_t NB2 = div_up(K, BLOCK_N); + scale_size_K = div_up(N, block_size_K); + + // parallel on [MB2, NB2] + at::parallel_for(0, MB2 * NB2, 0, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + alignas(64) scalar_t C[BLOCK_M * BLOCK_K]; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB2; + int64_t nb = i % NB2; + int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M); + int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N); + + // 2.a gemm: C = A @ B + tinygemm_kernel( + /* A */ ic1 + mb * BLOCK_M * N, + /* B */ packed_w2 + nb * BLOCK_N * N, + /* C */ C, + /* Btmp */ B_tmp + tid * BLOCK_N * std::max(K, N), + /* Ctmp */ C_tmp + tid * 2 * BLOCK_M * BLOCK_N, + /* scale */ w2s + (nb / blocks_n_per_group) * scale_size_K, + /* M */ m_size, + /* N */ n_size, + /* K */ IC, + /* lda */ IC, + /* ldb */ n_size, + /* ldc */ BLOCK_N, + /* brg */ use_brgemm, + /* block_size_K */ block_size_K); + + // 2.b copy from C to output and add fused_experts_out + scalar_t* __restrict__ out = output + mb * BLOCK_M * K + nb * BLOCK_N; + const scalar_t* 
__restrict__ fused_out = fused_experts_out + mb * BLOCK_M * K + nb * BLOCK_N; + for (int64_t m = 0; m < m_size; ++m) { + add_mul_stub(out + m * K, C + m * BLOCK_N, fused_out + m * K, routed_scaling_factor, n_size); + } + } + }); + + if (use_brgemm) { + at::native::cpublas::brgemm_release(); + } +} + +#define INSTANTIATE_SHARED_EXPERT_FP8_TEMPLATE(TYPE) \ + template void shared_expert_fp8_kernel_impl( \ + TYPE* __restrict__ output, \ + TYPE* __restrict__ ic0, \ + TYPE* __restrict__ ic1, \ + TYPE* __restrict__ B_tmp, \ + float* __restrict__ C_tmp, \ + const TYPE* __restrict__ input, \ + const at::Float8_e4m3fn* __restrict__ packed_w1, \ + const at::Float8_e4m3fn* __restrict__ packed_w2, \ + const float* __restrict__ w1s, \ + const float* __restrict__ w2s, \ + int64_t block_size_N, \ + int64_t block_size_K, \ + const TYPE* __restrict__ fused_experts_out, \ + float routed_scaling_factor, \ + int64_t M, \ + int64_t N, \ + int64_t K) + +INSTANTIATE_SHARED_EXPERT_FP8_TEMPLATE(at::BFloat16); +INSTANTIATE_SHARED_EXPERT_FP8_TEMPLATE(at::Half); diff --git a/csrc/cpu/sgl-kernels/moe_int8.cpp b/csrc/cpu/sgl-kernels/moe_int8.cpp new file mode 100644 index 0000000000000..89d0fb5d9f3b7 --- /dev/null +++ b/csrc/cpu/sgl-kernels/moe_int8.cpp @@ -0,0 +1,769 @@ +// Adapted from +// https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc/cpu + +#include "common.h" +#include "vec.h" +#include "gemm.h" + +// clang-format off + +namespace { + +template +inline void copy_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t size) { + using Vec = at::vec::Vectorized; + // no remainder + #pragma GCC unroll 4 + for (int64_t d = 0; d < size; d += Vec::size()) { + Vec data = Vec::loadu(input + d); + data.store(out + d); + } +} + +template <> +inline void copy_stub(uint8_t* __restrict__ out, const uint8_t* __restrict__ input, int64_t size) { + // size might be 64x + 32 + std::memcpy(out, input, size * sizeof(uint8_t)); +} + +template +inline void copy_mul_stub(scalar_t* __restrict__ out, const float* __restrict__ input, float weight, int64_t size) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + const fVec weight_vec = fVec(weight); + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + fVec data0 = fVec::loadu(input + d) * weight_vec; + fVec data1 = fVec::loadu(input + d + fVec::size()) * weight_vec; + bVec out_vec = convert_from_float_ext(data0, data1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d] * weight); + } +} + +// acc from [topk, K] to [K] +template +inline void sum_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t topk, int64_t K) { + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + if (topk == 1) { + // do copy for topk = 1 + copy_stub(out, input, K); + } else { + // do sum for topk != 1 + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= K - kVecSize; d += kVecSize) { + fVec sum_fvec0 = fVec(0.f); + fVec sum_fvec1 = fVec(0.f); + for (int t = 0; t < topk; ++t) { + bVec x_bvec = bVec::loadu(input + t * K + d); + fVec x_fvec0, x_fvec1; + std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec); + + sum_fvec0 += x_fvec0; + sum_fvec1 += x_fvec1; + } + bVec out_bvec = convert_from_float_ext(sum_fvec0, sum_fvec1); + out_bvec.store(out + d); + } + for (; d < K; ++d) { + float sum_val = 0.f; + for (int t = 0; t < topk; ++t) { + sum_val += 
static_cast(input[t * K + d]); + } + out[d] = static_cast(sum_val); + } + } +} + +// out = input + input2 * scale +template +inline void add_mul_stub(scalar_t* __restrict__ out, const float* __restrict__ input, + const scalar_t* __restrict__ input2, float scale, int64_t size) { + + using bVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + constexpr int kVecSize = bVec::size(); + const fVec s_vec = fVec(scale); + int64_t d; + #pragma GCC unroll 4 + for (d = 0; d <= size - kVecSize; d += kVecSize) { + fVec x0 = fVec::loadu(input + d); + fVec x1 = fVec::loadu(input + d + fVec::size()); + + bVec y_bvec = bVec::loadu(input2 + d); + fVec y0, y1; + std::tie(y0, y1) = at::vec::convert_to_float(y_bvec); + + x0 = x0 + y0 * s_vec; + x1 = x1 + y1 * s_vec; + bVec out_vec = convert_from_float_ext(x0, x1); + out_vec.store(out + d); + } + for (; d < size; ++d) { + out[d] = static_cast(input[d] + float(input2[d]) * scale); + } +} + +/// gemm for w13 +template +struct tinygemm_kernel_vnni { + static inline void apply( + const uint8_t* __restrict__ A, const int8_t* __restrict__ B0, const int8_t* __restrict__ B1, scalar_t* __restrict__ C, + const float* __restrict__ As, const float* __restrict__ Bs0, const float* __restrict__ Bs1, + const int32_t* __restrict__ Bcomp0, const int32_t* __restrict__ Bcomp1, + int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!"); + } +}; + +#if defined(CPU_CAPABILITY_AVX512) +template +struct tinygemm_kernel_vnni { + static inline void apply( + const uint8_t* __restrict__ A, const int8_t* __restrict__ B0, const int8_t* __restrict__ B1, at::BFloat16* __restrict__ C, + const float* __restrict__ As, const float* __restrict__ Bs0, const float* __restrict__ Bs1, + const int32_t* __restrict__ Bcomp0, const int32_t* __restrict__ Bcomp1, + int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 16; + static_assert(COLS % 2 == 0); + + __m512i va; + __m512i vb0[COLS]; + __m512i vb1[COLS]; + __m512i vc0[ROWS * COLS]; + __m512i vc1[ROWS * COLS]; + __m512i vcomp0[COLS]; + __m512i vcomp1[COLS]; + __m512 was; + __m512 vbs0[COLS]; + __m512 vbs1[COLS]; + + auto loadc = [&](auto i) { + vc0[i] = _mm512_set1_epi32(0); + vc1[i] = _mm512_set1_epi32(0); + }; + Unroll{}(loadc); + + const int64_t K4 = K >> 2; + const int64_t lda4 = lda >> 2; + const int64_t ldb4 = ldb; // ldb * 4 >> 2; + const int32_t* a_ptr = reinterpret_cast(A); + const int32_t* b0_ptr = reinterpret_cast(B0); + const int32_t* b1_ptr = reinterpret_cast(B1); + + auto compute = [&](auto i, int64_t k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + va = _mm512_set1_epi32(a_ptr[row * lda4 + k]); + } + if constexpr (row == 0) { + vb0[col] = _mm512_loadu_si512(b0_ptr + k * ldb4 + col * 16); + vb1[col] = _mm512_loadu_si512(b1_ptr + k * ldb4 + col * 16); + } + vc0[i] = _mm512_dpbusd_epi32(vc0[i], va, vb0[col]); + vc1[i] = _mm512_dpbusd_epi32(vc1[i], va, vb1[col]); + }; + for (int64_t k = 0; k < K4; ++k) { + Unroll{}(compute, k); + } + + auto scalec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + // load a scale + if constexpr(col == 0) { + was = _mm512_set1_ps(As[row]); + } + // load b scale and vcomp + if constexpr (row == 0) { + vbs0[col] = _mm512_loadu_ps(Bs0 + col * 16); + vbs1[col] = _mm512_loadu_ps(Bs1 + col * 16); + vcomp0[col] = _mm512_loadu_si512(Bcomp0 + col * 16); + vcomp1[col] = 
_mm512_loadu_si512(Bcomp1 + col * 16); + } + __m512 c0 = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc0[i], vcomp0[col])); + __m512 c1 = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc1[i], vcomp1[col])); + vc0[i] = _mm512_castps_si512(_mm512_mul_ps(_mm512_mul_ps(c0, was), vbs0[col])); + vc1[i] = _mm512_castps_si512(_mm512_mul_ps(_mm512_mul_ps(c1, was), vbs1[col])); + }; + Unroll{}(scalec); + + using Vec = at::vec::Vectorized; + const Vec one = Vec(1.f); + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + // for COLS = 2, 4 use 512bit store + if constexpr (col % 2 == 0) { + Vec x0 = _mm512_castsi512_ps(vc0[row * COLS + col + 0]); + Vec x1 = _mm512_castsi512_ps(vc0[row * COLS + col + 1]); + Vec y0 = _mm512_castsi512_ps(vc1[row * COLS + col + 0]); + Vec y1 = _mm512_castsi512_ps(vc1[row * COLS + col + 1]); + // silu + x0 = x0 / (one + x0.neg().exp_u20()); + x1 = x1 / (one + x1.neg().exp_u20()); + // mul + x0 = x0 * y0; + x1 = x1 * y1; + + _mm512_storeu_si512( + reinterpret_cast<__m512i*>((C + row * ldc + col * 16)), + (__m512i)(_mm512_cvtne2ps_pbh(__m512(x1), __m512(x0)))); + } + }; + Unroll{}(storec); + } +}; +#endif + +#define LAUNCH_TINYGEMM_KERNEL_VNNI(MB_SIZE, NB_SIZE) \ + tinygemm_kernel_vnni::apply( \ + A + mb_start * lda, B0 + nb_start * 4, B1 + nb_start * 4, \ + C + mb_start * ldc + nb_start, As + mb_start, \ + Bs0 + nb_start, Bs1 + nb_start, Bcomp0 + nb_start, Bcomp1 + nb_start,\ + K, lda, ldb, ldc); + +template +void tinygemm_kernel( + const uint8_t* __restrict__ A, + const int8_t* __restrict__ B0, + const int8_t* __restrict__ B1, + scalar_t* __restrict__ C, + const float* __restrict__ As, + const float* __restrict__ Bs0, + const float* __restrict__ Bs1, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc) { + + const int32_t* Bcomp0 = reinterpret_cast(B0 + block_size_n() * K); + const int32_t* Bcomp1 = reinterpret_cast(B1 + block_size_n() * K); + + // pattern: 1-(2+2)-(8+8) + constexpr int64_t BLOCK_M = 4; + constexpr int64_t BLOCK_N = 32; + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + for (int mb = 0; mb < MB; ++mb) { + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(BLOCK_M, M - mb_start); + for (int64_t nb = 0; nb < NB; ++nb) { + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(BLOCK_N, N - nb_start); + + switch(mb_size << 4 | nb_size >> 4) { + case 0x12: LAUNCH_TINYGEMM_KERNEL_VNNI(1, 32); break; + case 0x22: LAUNCH_TINYGEMM_KERNEL_VNNI(2, 32); break; + case 0x32: LAUNCH_TINYGEMM_KERNEL_VNNI(3, 32); break; + case 0x42: LAUNCH_TINYGEMM_KERNEL_VNNI(4, 32); break; + default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size"); + } + } + } +} + +/// gemm for w2 +template +struct tinygemm_kernel_vnni2 { + static inline void apply( + const uint8_t* __restrict__ A, const int8_t* __restrict__ B, float* __restrict__ C, + const float* __restrict__ As, const float* __restrict__ Bs, const int32_t* __restrict__ Bcomp, + int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!"); + } +}; + +#if defined(CPU_CAPABILITY_AVX512) +template +struct tinygemm_kernel_vnni2 { + static inline void apply( + const uint8_t* __restrict__ A, const int8_t* __restrict__ B, float* __restrict__ C, + const float* __restrict__ As, const float* __restrict__ Bs, const int32_t* __restrict__ Bcomp, + int64_t K, int64_t lda, int64_t ldb, int64_t ldc) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = 
BLOCK_N / 16; + static_assert(COLS % 2 == 0); + + __m512i va; + __m512i vb[COLS]; + __m512i vc[ROWS * COLS]; + __m512i vcomp[COLS]; + __m512 was; + __m512 vbs[COLS]; + + auto loadc = [&](auto i) { + vc[i] = _mm512_set1_epi32(0); + }; + Unroll{}(loadc); + + const int64_t K4 = K >> 2; + const int64_t lda4 = lda >> 2; + const int64_t ldb4 = ldb; // ldb * 4 >> 2; + const int32_t* a_ptr = reinterpret_cast(A); + const int32_t* b_ptr = reinterpret_cast(B); + + auto compute = [&](auto i, int64_t k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + va = _mm512_set1_epi32(a_ptr[row * lda4 + k]); + } + if constexpr (row == 0) { + vb[col] = _mm512_loadu_si512(b_ptr + k * ldb4 + col * 16); + } + vc[i] = _mm512_dpbusd_epi32(vc[i], va, vb[col]); + }; + for (int64_t k = 0; k < K4; ++k) { + Unroll{}(compute, k); + } + + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + // load a scale + if constexpr(col == 0) { + was = _mm512_set1_ps(As[row]); + } + // load b scale and vcomp per 2 vectors + // also load bias if any + if constexpr (row == 0) { + if constexpr (col % 2 == 0) { + vbs[col + 0] = _mm512_loadu_ps(Bs + col * 16); + vbs[col + 1] = _mm512_loadu_ps(Bs + col * 16 + 16); + vcomp[col + 0] = _mm512_loadu_si512(Bcomp + col * 16); + vcomp[col + 1] = _mm512_loadu_si512(Bcomp + col * 16 + 16); + } + } + __m512 x = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc[i], vcomp[col])); + x = _mm512_mul_ps(_mm512_mul_ps(x, was), vbs[col]); + _mm512_storeu_ps(reinterpret_cast<__m512*>(C + row * ldc + col * 16), x); + }; + Unroll{}(storec); + } +}; +#endif + +#define LAUNCH_TINYGEMM_KERNEL_VNNI2(MB_SIZE, NB_SIZE) \ + tinygemm_kernel_vnni2::apply( \ + A + mb_start * lda, B + nb_start * 4, C + mb_start * ldc + nb_start, \ + As + mb_start, Bs + nb_start, Bcomp + nb_start, \ + K, lda, ldb, ldc); + +template +void tinygemm_kernel( + const uint8_t* __restrict__ A, + const int8_t* __restrict__ B, + float* __restrict__ C, + const float* __restrict__ As, + const float* __restrict__ Bs, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc) { + + // B compensation + const int32_t* Bcomp = reinterpret_cast(B + block_size_n() * K); + + // pattern: 1-4-16 + constexpr int64_t BLOCK_M = 4; + constexpr int64_t BLOCK_N = 64; + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + for (int64_t mb = 0; mb < MB; ++mb) { + int64_t mb_start = mb * BLOCK_M; + int64_t mb_size = std::min(BLOCK_M, M - mb_start); + for (int64_t nb = 0; nb < NB; ++nb) { + int64_t nb_start = nb * BLOCK_N; + int64_t nb_size = std::min(BLOCK_N, N - nb_start); + + switch(mb_size << 4 | nb_size >> 4) { + case 0x12: LAUNCH_TINYGEMM_KERNEL_VNNI2(1, 32); break; + case 0x22: LAUNCH_TINYGEMM_KERNEL_VNNI2(2, 32); break; + case 0x32: LAUNCH_TINYGEMM_KERNEL_VNNI2(3, 32); break; + case 0x42: LAUNCH_TINYGEMM_KERNEL_VNNI2(4, 32); break; + default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size"); + } + } + } +} + +} // anonymous namespace + +template +void fused_experts_int8_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic1, + scalar_t* __restrict__ ic2, + uint8_t* __restrict__ A_tmp, + float* __restrict__ C_tmp, + uint8_t* __restrict__ Aq_tmp, + float* __restrict__ As_tmp, + const scalar_t* __restrict__ input, + const int8_t* __restrict__ packed_w1, + const int8_t* __restrict__ packed_w2, + const float* __restrict__ w1s, + const float* __restrict__ w2s, + const float* __restrict__ 
topk_weights, + const int32_t* __restrict__ sorted_ids, + const int32_t* __restrict__ expert_ids, + const int32_t* __restrict__ offsets, + int64_t M, + int64_t N, + int64_t K, + int64_t E, + int64_t topk, + int64_t num_tokens_post_pad) { + + // handle 2 tiles per block + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + + // stage 0: quantize input to uint8, [M, K] + at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + quantize_row_int8( + Aq_tmp + m * K, + As_tmp[m], + input + m * K, + K); + } + }); + + // stage 1: intermediate_cache1 = silu(hidden_states @ w1) + const int64_t MB = div_up(num_tokens_post_pad, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + + // strides for w1: [E, 2N, K] + TORCH_CHECK(N % BLOCK_N == 0, "Fixme when N is not multiples of ", BLOCK_N); + + // K and N are packed for int8 + const int64_t packed_K = get_row_size(K); + const int64_t packed_N = get_row_size(N); + + const int64_t stride_e = 2 * N * packed_K; + const int64_t stride_n = packed_K; + // here we only parallel on half of 2N to fuse silu_and_mul with gemm + at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { + // get local pointers + int tid = at::get_thread_num(); + uint8_t* __restrict__ A = A_tmp + tid * BLOCK_M * K; + + alignas(64) float As[BLOCK_M]; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB; + int64_t nb = i % NB; + + // nb0 from top half and nb1 from bottom half + int64_t nb0 = nb, nb1 = nb + NB; + int64_t n_size = std::min(N - nb0 * BLOCK_N, BLOCK_N); + + // B shape [K, n_size] in vnni format + int32_t expert_id = expert_ids[mb]; + const int8_t* __restrict__ B0 = packed_w1 + expert_id * stride_e + nb0 * BLOCK_N * stride_n; + const int8_t* __restrict__ B1 = packed_w1 + expert_id * stride_e + nb1 * BLOCK_N * stride_n; + const float* __restrict__ Bs0 = w1s + expert_id * 2 * N + nb0 * BLOCK_N; + const float* __restrict__ Bs1 = w1s + expert_id * 2 * N + nb1 * BLOCK_N; + + // 1.a load A + const int32_t* A_ids = sorted_ids + mb * BLOCK_M; + int64_t m_size = offsets[mb + 1] - offsets[mb]; + + for (int64_t m = 0; m < m_size; ++m) { + int32_t index = A_ids[m] / topk; + copy_stub(A + m * K, Aq_tmp + index * K, K); + As[m] = As_tmp[index]; + } + + // fused 1.b: silu_and_mul(A @ B0, A @ B1) + const int64_t offset = offsets[mb]; + tinygemm_kernel( + /* A */ A, + /* B0 */ B0, + /* B1 */ B1, + /* C */ ic1 + offset * N + nb * BLOCK_N, + /* As */ As, + /* Bs0 */ Bs0, + /* Bs1 */ Bs1, + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ N); + } + }); + + // stage 1.5: quantize ic1 to uint8, [M * topk, N] + at::parallel_for(0, M * topk, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + quantize_row_int8( + Aq_tmp + m * N, + As_tmp[m], + ic1 + m * N, + N); + } + }); + + // stage 2: intermediate_cache2 = intermediate_cache1 @ w2 + // w2 : [E, K, N] as [E, OC, IC] + const int64_t OC = K; // rename K as OC + const int64_t IC = N; // rename N as IC + const int64_t MB2 = MB; + const int64_t NB2 = div_up(OC, BLOCK_N); + const int64_t stride_e2 = OC * packed_N; + const int64_t stride_oc = packed_N; + + // parallel on [MB2, NB2] + at::parallel_for(0, MB2 * NB2, 0, [&](int64_t begin, int64_t end) { + // get local pointers + int tid = at::get_thread_num(); + // we won't be using C1 for gemm2 + float* __restrict__ C = C_tmp + tid * 2 * BLOCK_M * BLOCK_N; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB2; + 
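+      // i walks the [MB2, NB2] tile grid: mb indexes a block of sorted,
+      // expert-grouped tokens, nb indexes a BLOCK_N-wide slice of the w2
+      // output channels (OC = K).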
int64_t nb = i % NB2; + + int64_t m_size = offsets[mb + 1] - offsets[mb]; + int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N); + + // A ptr from ic1 of [M * topk, N] in sorted order + // so as to avoid copy A to tmp buffer again + const uint8_t* __restrict__ A = Aq_tmp + offsets[mb] * N; + const float* __restrict__ As = As_tmp + offsets[mb]; + const int32_t* A_ids = sorted_ids + mb * BLOCK_M; + + // B shape [IC, n_size] in vnni format + int32_t expert_id = expert_ids[mb]; + const int8_t* __restrict__ B = packed_w2 + expert_id * stride_e2 + nb * BLOCK_N * stride_oc; + const float* __restrict__ Bs = w2s + expert_id * K + nb * BLOCK_N; + + // 2.a gemm: C = A @ B + tinygemm_kernel( + /* A */ A, + /* B */ B, + /* C */ C, + /* As */ As, + /* Bs */ Bs, + /* M */ m_size, + /* N */ n_size, + /* K */ IC, + /* lda */ IC, + /* ldb */ n_size, + /* ldc */ BLOCK_N); + + // 2.b copy from C to ic2 in original order + // and also mul topk_weights in float32 + for (int64_t m = 0; m < m_size; ++m) { + int32_t index = A_ids[m]; + float weight = topk_weights[index]; + copy_mul_stub(ic2 + index * K + nb * BLOCK_N, C + m * BLOCK_N, weight, n_size); + } + } + }); + + // stage 3: out = intermediate_cache2.sum(dim=1) + // from [M, topk, K] to [M, K] + at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + sum_stub(output + m * K, ic2 + m * topk * K, topk, K); + } + }); +} + +#define INSTANTIATE_MOE_INT8_TEMPLATE(TYPE) \ + template void fused_experts_int8_kernel_impl ( \ + TYPE* __restrict__ output, TYPE* __restrict__ ic1, \ + TYPE* __restrict__ ic2, uint8_t* __restrict__ A_tmp, \ + float* __restrict__ C_tmp, uint8_t* __restrict__ Aq_tmp, \ + float* __restrict__ As_tmp, const TYPE* __restrict__ input, \ + const int8_t* __restrict__ packed_w1, const int8_t* __restrict__ packed_w2, \ + const float* __restrict__ w1s, const float* __restrict__ w2s, \ + const float* __restrict__ topk_weights, const int32_t* __restrict__ sorted_ids, \ + const int32_t* __restrict__ expert_ids, const int32_t* __restrict__ offsets, \ + int64_t M, int64_t N, int64_t K, int64_t E, int64_t topk, int64_t num_tokens_post_pad) + +INSTANTIATE_MOE_INT8_TEMPLATE(at::BFloat16); +INSTANTIATE_MOE_INT8_TEMPLATE(at::Half); + +template +void shared_expert_int8_kernel_impl( + scalar_t* __restrict__ output, + scalar_t* __restrict__ ic1, + float* __restrict__ C_tmp, + uint8_t* __restrict__ Aq_tmp, + float* __restrict__ As_tmp, + const scalar_t* __restrict__ input, + const int8_t* __restrict__ packed_w1, + const int8_t* __restrict__ packed_w2, + const float* __restrict__ w1s, + const float* __restrict__ w2s, + const scalar_t* __restrict__ fused_experts_out, + float routed_scaling_factor, + int64_t M, + int64_t N, + int64_t K) { + + // handle 2 tiles per block + constexpr int64_t BLOCK_M = block_size_m(); + constexpr int64_t BLOCK_N = block_size_n(); + + // stage 0: quantize input to uint8, [M, K] + at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + quantize_row_int8( + Aq_tmp + m * K, + As_tmp[m], + input + m * K, + K); + } + }); + + // stage 1: intermediate_cache1 = silu(hidden_states @ w1) + const int64_t MB = div_up(M, BLOCK_M); + const int64_t NB = div_up(N, BLOCK_N); + + TORCH_CHECK(N % BLOCK_N == 0, "Fixme when N is not multiples of ", BLOCK_N); + + // K and N are packed for int8 + const int64_t packed_K = get_row_size(K); + const int64_t packed_N = get_row_size(N); + const int64_t stride_n = packed_K; + + // here we only parallel on half of 2N to 
fuse silu_and_mul with gemm + at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB; + int64_t nb = i % NB; + + // nb0 from top half and nb1 from bottom half + int64_t nb0 = nb, nb1 = nb + NB; + int64_t n_size = std::min(N - nb0 * BLOCK_N, BLOCK_N); + int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M); + + // A shape [m_size, K] + const uint8_t* A = Aq_tmp + mb * BLOCK_M * K; + const float* As = As_tmp + mb * BLOCK_M; + + // B shape [K, n_size] in vnni format + const int8_t* __restrict__ B0 = packed_w1 + nb0 * BLOCK_N * stride_n; + const int8_t* __restrict__ B1 = packed_w1 + nb1 * BLOCK_N * stride_n; + const float* __restrict__ Bs0 = w1s + nb0 * BLOCK_N; + const float* __restrict__ Bs1 = w1s + nb1 * BLOCK_N; + + // fused 1.b: silu_and_mul(A @ B0, A @ B1) + tinygemm_kernel( + /* A */ A, + /* B0 */ B0, + /* B1 */ B1, + /* C */ ic1 + mb * BLOCK_M * N + nb * BLOCK_N, + /* As */ As, + /* Bs0 */ Bs0, + /* Bs1 */ Bs1, + /* M */ m_size, + /* N */ n_size, + /* K */ K, + /* lda */ K, + /* ldb */ n_size, + /* ldc */ N); + } + }); + + // stage 1.5: quantize ic1 to uint8, [M * topk, N] + at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) { + for (int64_t m = begin; m < end; ++m) { + quantize_row_int8( + Aq_tmp + m * N, + As_tmp[m], + ic1 + m * N, + N); + } + }); + + // stage 2: intermediate_cache2 = intermediate_cache1 @ w2 + // w2 : [K, N] as [OC, IC] + const int64_t OC = K; // rename K as OC + const int64_t IC = N; // rename N as IC + const int64_t MB2 = MB; + const int64_t NB2 = div_up(OC, BLOCK_N); + const int64_t stride_oc = packed_N; + + // parallel on [MB2, NB2] + at::parallel_for(0, MB2 * NB2, 0, [&](int64_t begin, int64_t end) { + // get local pointers + int tid = at::get_thread_num(); + // we won't be using C1 for gemm2 + float* __restrict__ C = C_tmp + tid * 2 * BLOCK_M * BLOCK_N; + + for (int64_t i = begin; i < end; ++i) { + int64_t mb = i / NB2; + int64_t nb = i % NB2; + + int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M); + int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N); + + // A shape [m_size, IC] + const uint8_t* __restrict__ A = Aq_tmp + mb * BLOCK_M * N; + const float* __restrict__ As = As_tmp + mb * BLOCK_M; + + // B shape [IC, n_size] in vnni format + const int8_t* __restrict__ B = packed_w2 + nb * BLOCK_N * stride_oc; + const float* __restrict__ Bs = w2s + nb * BLOCK_N; + + // 2.a gemm: C = A @ B + tinygemm_kernel( + /* A */ A, + /* B */ B, + /* C */ C, + /* As */ As, + /* Bs */ Bs, + /* M */ m_size, + /* N */ n_size, + /* K */ IC, + /* lda */ IC, + /* ldb */ n_size, + /* ldc */ BLOCK_N); + + // 2.b copy from C to output and add fused_experts_out + scalar_t* __restrict__ out = output + mb * BLOCK_M * K + nb * BLOCK_N; + const scalar_t* __restrict__ fused_out = fused_experts_out + mb * BLOCK_M * K + nb * BLOCK_N; + for (int64_t m = 0; m < m_size; ++m) { + add_mul_stub(out + m * K, C + m * BLOCK_N, fused_out + m * K, routed_scaling_factor, n_size); + } + } + }); +} + +#define INSTANTIATE_SHARED_EXPERT_INT8_TEMPLATE(TYPE) \ + template void shared_expert_int8_kernel_impl ( \ + TYPE* __restrict__ output, TYPE* __restrict__ ic1, \ + float* __restrict__ C_tmp, uint8_t* __restrict__ Aq_tmp, \ + float* __restrict__ As_tmp, const TYPE* __restrict__ input, \ + const int8_t* __restrict__ packed_w1, const int8_t* __restrict__ packed_w2, \ + const float* __restrict__ w1s, const float* __restrict__ w2s, \ + const TYPE* __restrict__ fused_experts_out, float routed_scaling_factor, \ + int64_t M, int64_t 
N, int64_t K) + +INSTANTIATE_SHARED_EXPERT_INT8_TEMPLATE(at::BFloat16); +INSTANTIATE_SHARED_EXPERT_INT8_TEMPLATE(at::Half); diff --git a/csrc/cpu/sgl-kernels/vec.h b/csrc/cpu/sgl-kernels/vec.h new file mode 100644 index 0000000000000..87955cfb2922c --- /dev/null +++ b/csrc/cpu/sgl-kernels/vec.h @@ -0,0 +1,308 @@ +// Adapted from +// https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc/cpu + +#pragma once + +// clang-format off + +#if defined(__AVX512F__) && defined(__AVX512BF16__) && defined(__AMX_BF16__) +#define CPU_CAPABILITY_AVX512 +#endif + +#include +#include + +namespace { + +using namespace at::vec; + +template , int> = 0> +inline Vectorized convert_from_float_ext(const Vectorized& a, const Vectorized& b) { + return at::vec::convert_from_float(a, b); +} + +#if defined(CPU_CAPABILITY_AVX512) + +// `at::vec::convert_from_float<>` from PyTorch doesn't have avx512-bf16 intrinsics +// use native instruction for bfloat16->float32 conversion +template <> +inline Vectorized convert_from_float_ext(const Vectorized& a, const Vectorized& b) { + return (__m512i)(_mm512_cvtne2ps_pbh(__m512(b), __m512(a))); +} + +#define CVT_BF16_TO_FP32(a) \ + _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(a), 16)) + +#define CVT_FP16_TO_FP32(a) \ + _mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) + +// this doesn't hanel NaN. +inline __m512bh cvt_e4m3_bf16_intrinsic_no_nan(__m256i fp8_vec) { + const __m512i x = _mm512_cvtepu8_epi16(fp8_vec); + + const __m512i mant = _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(0x07)), 4); + const __m512i raw_exp = _mm512_srli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(0x78)), 3); + const __m512i exp = _mm512_slli_epi16(_mm512_add_epi16(raw_exp, _mm512_set1_epi16(120)), 7); + const __m512i nonsign = _mm512_or_si512(exp, mant); + + const __m512i sign = _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(0x80)), 8); + const __m512i combined = _mm512_or_si512(nonsign, sign); + + const __mmask32 is_nonzero = _mm512_cmpneq_epi16_mask(x, _mm512_setzero_si512()); + return (__m512bh)_mm512_maskz_mov_epi16(is_nonzero, combined); +} + +inline __m512bh cvt_e4m3_bf16_intrinsic_without_denorm(__m256i fp8_vec) { + // The following conversion is without denorm behavior, that is to say, + // Max subnorm : S.0000.111 = 0.875 ∗ 2**(−6) + // Min subnorm : S.0000.001 = 2**(−9) + // 0.0019 ~ 0.0137 cannot be converted correctly. 
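  // Bit-layout background for the rebiasing in these converters: e4m3 packs
  // 1 sign / 4 exponent (bias 7) / 3 mantissa bits, while bf16 packs
  // 1 sign / 8 exponent (bias 127) / 7 mantissa bits. A normal e4m3 value is
  // therefore widened by adding 127 - 7 = 120 to the exponent field, shifting
  // the 3-bit mantissa left by 7 - 3 = 4, and moving the sign from bit 7 to
  // bit 15 (a left shift by 8 after zero-extending each byte to 16 bits).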
+ __m512i x = _mm512_cvtepu8_epi16(fp8_vec); + auto mask = _mm512_cmpneq_epi16_mask( + _mm512_and_si512(x, _mm512_set1_epi16(127)), + _mm512_setzero_si512()); // mask = x & 0x7f + auto mask_nan = _mm512_cmpneq_epi16_mask( + _mm512_and_si512(x, _mm512_set1_epi16(127)), + _mm512_set1_epi16(127)); // mask_nan = x & 0x7f + auto mantissa = _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(7)), 4); // mantissa = (x & 7) << 4 + auto exponent = _mm512_add_epi16( + _mm512_srli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(120)), 3), + _mm512_set1_epi16(120)); // exponent = (((x >> 3) & 15) + 120) + auto nonsign = _mm512_maskz_mov_epi16(mask, _mm512_or_si512(mantissa, _mm512_slli_epi16(exponent, 7))); + nonsign = _mm512_mask_mov_epi16(_mm512_set1_epi16(0x7fff), mask_nan, nonsign); // deal with Nan + return (__m512bh)(_mm512_or_si512( + nonsign, + _mm512_slli_epi16( + _mm512_and_si512(x, _mm512_set1_epi16(128)), + 8))); // add sign (x & 128) << 8 +} + +inline __m512bh cvt_e4m3_bf16_intrinsic_with_denorm(__m256i fp8_vec) { + __m512i x = _mm512_cvtepu8_epi16(fp8_vec); + __m512i lg2mant = _mm512_mask_mov_epi16( + _mm512_mask_mov_epi16( + _mm512_setzero_si512(), _mm512_test_epi16_mask(x, _mm512_set1_epi16(2)), _mm512_set1_epi16(1)), + _mm512_test_epi16_mask(x, _mm512_set1_epi16(4)), + _mm512_set1_epi16(2)); + return (__m512bh)(_mm512_or_si512( + _mm512_maskz_mov_epi16( + _mm512_cmpneq_epi16_mask(_mm512_and_si512(x, _mm512_set1_epi16(127)), _mm512_setzero_si512()), + _mm512_mask_blend_epi16( + _mm512_test_epi16_mask(x, _mm512_set1_epi16(120)), + _mm512_or_si512( + _mm512_and_si512( + _mm512_sllv_epi16( + _mm512_and_si512(x, _mm512_set1_epi16(3)), _mm512_sub_epi16(_mm512_set1_epi16(7), lg2mant)), + _mm512_set1_epi16(0x007f)), + _mm512_slli_epi16(_mm512_add_epi16(lg2mant, _mm512_set1_epi16(118)), 7)), + _mm512_or_si512( + _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(7)), 4), + _mm512_slli_epi16( + _mm512_add_epi16( + _mm512_srli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(120)), 3), _mm512_set1_epi16(120)), + 7)))), + _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(128)), 8))); +} + +inline __m512bh CVT_FP8_TO_BF16(__m256i a) { +#ifdef SGLANG_CPU_FP8_CVT_FTZ + return cvt_e4m3_bf16_intrinsic_no_nan(a); +#else + return cvt_e4m3_bf16_intrinsic_with_denorm(a); +#endif +} + +#endif + +// vector to scalar reduction +#if defined(CPU_CAPABILITY_AVX512) && 0 +inline float vec_reduce_sum(const Vectorized& a) { + return _mm512_reduce_add_ps(__m512(a)); +} + +inline float vec_reduce_max(const Vectorized& a) { + return _mm512_reduce_max_ps(__m512(a)); +} +#else +inline float vec_reduce_sum(const Vectorized& a) { + return vec_reduce_all([](Vectorized& x, Vectorized& y) { return x + y; }, a); +} + +inline float vec_reduce_max(const Vectorized& a) { + return vec_reduce_all([](Vectorized& x, Vectorized& y) { return maximum(x, y); }, a); +} +#endif + +// https://github.com/InternLM/lmdeploy/blob/086481ed84b59bee3b8e4274e5fc69620040c048/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py#L282 +template +inline void quantize_row_int8(uint8_t* __restrict__ Aq, float& As, + const scalar_t* __restrict__ A, int64_t K, float eps = 1e-7) { + + float amax = 0.f; // absolute max + for (int64_t k = 0; k < K; ++k) { + const float val = static_cast(A[k]); + amax = std::max(amax, std::abs(val)); + } + + amax = std::max(amax, eps); + const float scale = amax / 127; + const float inv_scale = 127 / amax; + + for (int64_t k = 0; k < K; ++k) { + const float val = static_cast(A[k]) * inv_scale; + Aq[k] = 
(uint8_t)(std::round(val)) + 128; + } + As = scale; +} + +#if defined(CPU_CAPABILITY_AVX512) +template <> +inline void quantize_row_int8(uint8_t* __restrict__ Aq, float& As, + const at::BFloat16* __restrict__ A, int64_t K, float eps) { + + const __m512 signBit = _mm512_set1_ps(-0.0f); + const __m512i off = _mm512_set1_epi32(128); + + // K is 32x, no remainder + float amax = 0.f; + __m512 vamax0 = _mm512_set1_ps(0.f); + __m512 vamax1 = _mm512_set1_ps(0.f); + for (int64_t k = 0; k < K; k += 32) { + __m512i va = _mm512_loadu_si512((void*)(A + k)); + __m512 va0 = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32(va, 0)); + __m512 va1 = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32(va, 1)); + vamax0 = _mm512_max_ps(vamax0, _mm512_andnot_ps(signBit, va0)); + vamax1 = _mm512_max_ps(vamax1, _mm512_andnot_ps(signBit, va1)); + } + amax = _mm512_reduce_max_ps(_mm512_max_ps(vamax0, vamax1)); + amax = std::max(amax, eps); + const float scale = amax / 127; + const float inv_scale = 127 / amax; + const __m512 vd = _mm512_set1_ps(inv_scale); + + for (int64_t k = 0; k < K; k += 32) { + __m512i va = _mm512_loadu_si512((void*)(A + k)); + __m512 va0 = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32(va, 0)); + __m512 va1 = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32(va, 1)); + va0 = _mm512_mul_ps(va0, vd); + va1 = _mm512_mul_ps(va1, vd); + va0 = _mm512_roundscale_ps(va0, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + va1 = _mm512_roundscale_ps(va1, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + __m128i i0 = _mm512_cvtepi32_epi8(_mm512_add_epi32(_mm512_cvtps_epi32(va0), off)); + __m128i i1 = _mm512_cvtepi32_epi8(_mm512_add_epi32(_mm512_cvtps_epi32(va1), off)); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(Aq + k), _mm256_set_m128i(i1, i0)); + } + As = scale; +} +#endif + +// transpose utils +// taken from my PR in ggml: https://github.com/ggml-org/llama.cpp/pull/8998 +#if defined(CPU_CAPABILITY_AVX512) +inline void transpose_16x16_32bit(__m512i * v) { + __m512i v1[16]; + v1[0] = _mm512_unpacklo_epi32(v[0], v[1]); + v1[1] = _mm512_unpackhi_epi32(v[0], v[1]); + v1[2] = _mm512_unpacklo_epi32(v[2], v[3]); + v1[3] = _mm512_unpackhi_epi32(v[2], v[3]); + v1[4] = _mm512_unpacklo_epi32(v[4], v[5]); + v1[5] = _mm512_unpackhi_epi32(v[4], v[5]); + v1[6] = _mm512_unpacklo_epi32(v[6], v[7]); + v1[7] = _mm512_unpackhi_epi32(v[6], v[7]); + v1[8] = _mm512_unpacklo_epi32(v[8], v[9]); + v1[9] = _mm512_unpackhi_epi32(v[8], v[9]); + v1[10] = _mm512_unpacklo_epi32(v[10], v[11]); + v1[11] = _mm512_unpackhi_epi32(v[10], v[11]); + v1[12] = _mm512_unpacklo_epi32(v[12], v[13]); + v1[13] = _mm512_unpackhi_epi32(v[12], v[13]); + v1[14] = _mm512_unpacklo_epi32(v[14], v[15]); + v1[15] = _mm512_unpackhi_epi32(v[14], v[15]); + + v[0] = _mm512_unpacklo_epi64(v1[0], v1[2]); + v[1] = _mm512_unpackhi_epi64(v1[0], v1[2]); + v[2] = _mm512_unpacklo_epi64(v1[1], v1[3]); + v[3] = _mm512_unpackhi_epi64(v1[1], v1[3]); + v[4] = _mm512_unpacklo_epi64(v1[4], v1[6]); + v[5] = _mm512_unpackhi_epi64(v1[4], v1[6]); + v[6] = _mm512_unpacklo_epi64(v1[5], v1[7]); + v[7] = _mm512_unpackhi_epi64(v1[5], v1[7]); + v[8] = _mm512_unpacklo_epi64(v1[8], v1[10]); + v[9] = _mm512_unpackhi_epi64(v1[8], v1[10]); + v[10] = _mm512_unpacklo_epi64(v1[9], v1[11]); + v[11] = _mm512_unpackhi_epi64(v1[9], v1[11]); + v[12] = _mm512_unpacklo_epi64(v1[12], v1[14]); + v[13] = _mm512_unpackhi_epi64(v1[12], v1[14]); + v[14] = _mm512_unpacklo_epi64(v1[13], v1[15]); + v[15] = _mm512_unpackhi_epi64(v1[13], v1[15]); + + v1[0] = _mm512_shuffle_i32x4(v[0], v[4], 0x88); + v1[1] = 
_mm512_shuffle_i32x4(v[1], v[5], 0x88); + v1[2] = _mm512_shuffle_i32x4(v[2], v[6], 0x88); + v1[3] = _mm512_shuffle_i32x4(v[3], v[7], 0x88); + v1[4] = _mm512_shuffle_i32x4(v[0], v[4], 0xdd); + v1[5] = _mm512_shuffle_i32x4(v[1], v[5], 0xdd); + v1[6] = _mm512_shuffle_i32x4(v[2], v[6], 0xdd); + v1[7] = _mm512_shuffle_i32x4(v[3], v[7], 0xdd); + v1[8] = _mm512_shuffle_i32x4(v[8], v[12], 0x88); + v1[9] = _mm512_shuffle_i32x4(v[9], v[13], 0x88); + v1[10] = _mm512_shuffle_i32x4(v[10], v[14], 0x88); + v1[11] = _mm512_shuffle_i32x4(v[11], v[15], 0x88); + v1[12] = _mm512_shuffle_i32x4(v[8], v[12], 0xdd); + v1[13] = _mm512_shuffle_i32x4(v[9], v[13], 0xdd); + v1[14] = _mm512_shuffle_i32x4(v[10], v[14], 0xdd); + v1[15] = _mm512_shuffle_i32x4(v[11], v[15], 0xdd); + + v[0] = _mm512_shuffle_i32x4(v1[0], v1[8], 0x88); + v[1] = _mm512_shuffle_i32x4(v1[1], v1[9], 0x88); + v[2] = _mm512_shuffle_i32x4(v1[2], v1[10], 0x88); + v[3] = _mm512_shuffle_i32x4(v1[3], v1[11], 0x88); + v[4] = _mm512_shuffle_i32x4(v1[4], v1[12], 0x88); + v[5] = _mm512_shuffle_i32x4(v1[5], v1[13], 0x88); + v[6] = _mm512_shuffle_i32x4(v1[6], v1[14], 0x88); + v[7] = _mm512_shuffle_i32x4(v1[7], v1[15], 0x88); + v[8] = _mm512_shuffle_i32x4(v1[0], v1[8], 0xdd); + v[9] = _mm512_shuffle_i32x4(v1[1], v1[9], 0xdd); + v[10] = _mm512_shuffle_i32x4(v1[2], v1[10], 0xdd); + v[11] = _mm512_shuffle_i32x4(v1[3], v1[11], 0xdd); + v[12] = _mm512_shuffle_i32x4(v1[4], v1[12], 0xdd); + v[13] = _mm512_shuffle_i32x4(v1[5], v1[13], 0xdd); + v[14] = _mm512_shuffle_i32x4(v1[6], v1[14], 0xdd); + v[15] = _mm512_shuffle_i32x4(v1[7], v1[15], 0xdd); +} + +// remove warning : ignoring attributes on template argument ‘__m512i’ [-Wignored-attributes] +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wignored-attributes" + +// transpose from [2, 32] to [32, 2] +inline std::tuple<__m512i, __m512i> transpose_2x32_16bit(__m512i r0, __m512i r1) { + // r0: {a0, a1, ..., a31} + // r1: {b0, b1, ..., b31} + // + // d0: {a0, b0, ..., a15, b15} + // d1: {a16, b16, ..., a31, b31} + // + __m512i d0 = _mm512_unpacklo_epi16(r0, r1); + __m512i d1 = _mm512_unpackhi_epi16(r0, r1); + r0 = _mm512_shuffle_i32x4(d0, d1, 0x88); + r1 = _mm512_shuffle_i32x4(d0, d1, 0xdd); + d0 = _mm512_shuffle_i32x4(r0, r1, 0x88); + d1 = _mm512_shuffle_i32x4(r0, r1, 0xdd); + return std::make_tuple(d0, d1); +} +#pragma GCC diagnostic pop + +#endif + +// TODO: debug print, remove me later +template +void print_array(scalar_t* ptr, int size) { + for (int d = 0; d < size; ++d) { + if (d % 16 == 0) { std::cout << std::endl; } + std::cout << ptr[d] << " "; + } + std::cout << std::endl; +} + +} // anonymous namespace diff --git a/csrc/cpu/shm.cpp b/csrc/cpu/shm.cpp index f55e96de251d0..9adb6f27ec411 100644 --- a/csrc/cpu/shm.cpp +++ b/csrc/cpu/shm.cpp @@ -7,9 +7,10 @@ namespace { #define MAX_SHM_RANK_NUM 8 -#define MAX_THREAD_NUM 12 -#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024) -#define MIN_THREAD_PROCESS_SIZE (8 * 1024) +#define PER_THREAD_SHM_BUFFER_BYTES (2 * 1024 * 1024) +static_assert(PER_THREAD_SHM_BUFFER_BYTES % 2 == 0); +#define PER_THREAD_SHM_BUFFER_OFFSET (PER_THREAD_SHM_BUFFER_BYTES >> 1) +#define MIN_THREAD_PROCESS_SIZE (256) #define MAX_P2P_SEND_TENSOR_NUM 8 template @@ -32,10 +33,10 @@ struct KernelVecType { using scalar_vec_t = vec_op::FP16Vec16; }; -enum class ThreadSHMStat : char { THREAD_READY = 0, SHM_DATA_READY, DONE }; - struct ThreadSHMContext { - volatile ThreadSHMStat thread_stats[MAX_SHM_RANK_NUM]; + volatile char _curr_thread_stamp; + volatile char _ready_thread_stamp; + char 
_padding1[6]; int thread_id; int thread_num; int rank; @@ -44,14 +45,19 @@ struct ThreadSHMContext { int swizzled_ranks[MAX_SHM_RANK_NUM]; void* thread_shm_ptrs[MAX_SHM_RANK_NUM]; ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM]; + size_t _thread_buffer_mask; + char _padding2[56]; ThreadSHMContext(const int thread_id, const int thread_num, const int rank, const int group_size, void* thread_shm_ptr) - : thread_id(thread_id), + : _curr_thread_stamp(1), + _ready_thread_stamp(0), + thread_id(thread_id), thread_num(thread_num), rank(rank), group_size(group_size), - _spinning_count(0) { + _spinning_count(0), + _thread_buffer_mask(0) { static_assert(sizeof(ThreadSHMContext) % 64 == 0); TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM); TORCH_CHECK((size_t)this % 64 == 0); @@ -60,7 +66,6 @@ struct ThreadSHMContext { shm_contexts[i] = nullptr; thread_shm_ptrs[i] = nullptr; swizzled_ranks[i] = (i + rank) % group_size; - thread_stats[i] = ThreadSHMStat::DONE; } set_context(rank, this, thread_shm_ptr); } @@ -77,59 +82,66 @@ struct ThreadSHMContext { template T* get_thread_shm_ptr(int rank) { - return reinterpret_cast(thread_shm_ptrs[rank]); + return reinterpret_cast( + reinterpret_cast(thread_shm_ptrs[rank]) + + (PER_THREAD_SHM_BUFFER_OFFSET & _thread_buffer_mask)); + } + + void next_buffer() { _thread_buffer_mask ^= 0xFFFFFFFFFFFFFFFF; } + + char get_curr_stamp() const { return _curr_thread_stamp; } + + char get_ready_stamp() const { return _ready_thread_stamp; } + + void next_stamp() { + _mm_mfence(); + _curr_thread_stamp += 1; + } + + void commit_ready_stamp() { + _mm_mfence(); + _ready_thread_stamp = _curr_thread_stamp; } int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; } - void wait_for_all(ThreadSHMStat prev_stat) { - for (int idx = 0; idx < group_size; ++idx) { + template + void wait_for_all(Cond&& cond) { + for (int idx = 1; idx < group_size; ++idx) { int rank = get_swizzled_rank(idx); - while (thread_stats[rank] == prev_stat) { - ++_spinning_count; - _mm_pause(); - } + wait_for_one(rank, std::forward(cond)); } - vec_op::mem_barrier(); } - void wait_for_one(int rank, ThreadSHMStat prev_stat) { - while (thread_stats[rank] == prev_stat) { + template + void wait_for_one(int rank, Cond&& cond) { + ThreadSHMContext* rank_ctx = shm_contexts[rank]; + for (;;) { + char local_curr_stamp = get_curr_stamp(); + char local_ready_stamp = get_ready_stamp(); + char rank_curr_stamp = rank_ctx->get_curr_stamp(); + char rank_ready_stamp = rank_ctx->get_ready_stamp(); + if (cond(local_curr_stamp, local_ready_stamp, rank_curr_stamp, + rank_ready_stamp)) { + break; + } ++_spinning_count; _mm_pause(); } - vec_op::mem_barrier(); } - void set_thread_stat(ThreadSHMStat stat) { - for (int idx = 0; idx < group_size; ++idx) { - int rank = get_swizzled_rank(idx); - shm_contexts[rank]->thread_stats[this->rank] = stat; - } + static bool check_no_buffer_conflict(char local_curr_stamp, + char local_ready_stamp, + char rank_curr_stamp, + char rank_ready_stamp) { + char temp = rank_curr_stamp + 2; + return local_curr_stamp != temp; } - void set_thread_stat(int target_rank, ThreadSHMStat stat) { - for (int idx = 0; idx < group_size; ++idx) { - int rank = get_swizzled_rank(idx); - shm_contexts[rank]->thread_stats[target_rank] = stat; - } - } - - // barrier for all ranks in the group, used for all2all ops - // DONE -> THREAD_READY -> SHM_DATA_READY -> DONE -> ... 
- void barrier(ThreadSHMStat next_stat) { - if (next_stat == ThreadSHMStat::THREAD_READY) { - set_thread_stat(ThreadSHMStat::THREAD_READY); - wait_for_all(ThreadSHMStat::DONE); - } else if (next_stat == ThreadSHMStat::SHM_DATA_READY) { - set_thread_stat(ThreadSHMStat::SHM_DATA_READY); - wait_for_all(ThreadSHMStat::THREAD_READY); - } else if (next_stat == ThreadSHMStat::DONE) { - set_thread_stat(ThreadSHMStat::DONE); - wait_for_all(ThreadSHMStat::SHM_DATA_READY); - } else { - TORCH_CHECK(false, "Invalid next_stat to barrier."); - } + static bool check_stamp_ready(char local_curr_stamp, char local_ready_stamp, + char rank_curr_stamp, char rank_ready_stamp) { + char temp = local_curr_stamp + 1; + return (local_curr_stamp == rank_ready_stamp) || (temp == rank_ready_stamp); } std::string to_string() const { @@ -164,7 +176,7 @@ class SHMManager { const int group_size) : _rank(rank), _group_size(group_size), - _thread_num(std::min(torch::get_num_threads(), MAX_THREAD_NUM)), + _thread_num(torch::get_num_threads()), _shm_names({""}), _shared_mem_ptrs({nullptr}), _shm_ctx(nullptr) { @@ -326,7 +338,8 @@ void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) { (total_units_num + thread_num - 1) / thread_num; int64_t per_unit_elem_num = MIN_THREAD_PROCESS_SIZE / sizeof(scalar_t); int64_t max_per_thread_iteration_elem_num = - PER_THREAD_SHM_BUFFER_BYTES / sizeof(scalar_t); + (PER_THREAD_SHM_BUFFER_BYTES >> 1) / + sizeof(scalar_t); // Note: double buffer int64_t per_thread_elem_num = per_unit_elem_num * per_thread_units_num; #pragma omp parallel for schedule(static, 1) @@ -336,10 +349,13 @@ void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) { int64_t curr_elem_num = std::min(max_per_thread_iteration_elem_num, end - offset); ThreadSHMContext* thread_ctx = ctx + i; + bool fast_mode = ((end - offset) <= max_per_thread_iteration_elem_num); while (curr_elem_num > 0) { - inner_func(thread_ctx, offset, curr_elem_num); + inner_func(thread_ctx, offset, curr_elem_num, fast_mode); + thread_ctx->next_stamp(); + thread_ctx->next_buffer(); offset += max_per_thread_iteration_elem_num; curr_elem_num = std::min(max_per_thread_iteration_elem_num, end - offset); } @@ -397,7 +413,7 @@ void all_reduce_sum_impl(ThreadSHMContext* ctx, scalar_t* data, shm_cc_ops::shm_cc_loop( ctx, elem_num, [&](ThreadSHMContext* thread_ctx, int64_t data_offset, - int64_t data_elem_num) { + int64_t data_elem_num, bool fast_mode) { int rank = thread_ctx->rank; scalar_t* thread_shm_ptr = thread_ctx->get_thread_shm_ptr(rank); @@ -410,16 +426,17 @@ void all_reduce_sum_impl(ThreadSHMContext* ctx, scalar_t* data, thread_ctx->get_swizzled_rank(idx + 1)); }); - thread_ctx->barrier(ThreadSHMStat::THREAD_READY); + if (!fast_mode) { + thread_ctx->wait_for_all(ThreadSHMContext::check_no_buffer_conflict); + } shm_cc_ops::memcpy_to_shm(thread_shm_ptr, thread_data_ptr, thread_data_elem_num); - - thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY); - + thread_ctx->commit_ready_stamp(); int64_t aligned_data_elem_num = (data_elem_num / vec_elem_num) * vec_elem_num; int64_t i = 0; + thread_ctx->wait_for_all(ThreadSHMContext::check_stamp_ready); #pragma GCC unroll 4 for (; i < aligned_data_elem_num; i += vec_elem_num) { vec_t local_data(thread_data_ptr + i); // load from cache @@ -447,8 +464,6 @@ void all_reduce_sum_impl(ThreadSHMContext* ctx, scalar_t* data, reduced_data.save(thread_data_ptr + i, data_elem_num - aligned_data_elem_num); } - - thread_ctx->barrier(ThreadSHMStat::DONE); }); return; @@ -488,18 +503,18 @@ void 
shm_gather_impl(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num, shm_cc_ops::shm_cc_loop( ctx, elem_num, [&](ThreadSHMContext* thread_ctx, int64_t data_offset, - int64_t data_elem_num) { + int64_t data_elem_num, bool fast_mode) { int rank = thread_ctx->rank; scalar_t* thread_shm_ptr = thread_ctx->get_thread_shm_ptr(rank); - thread_ctx->barrier(ThreadSHMStat::THREAD_READY); - - shm_cc_ops::memcpy_to_shm(thread_shm_ptr, data + data_offset, - data_elem_num * sizeof(scalar_t)); - - thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY); + if (!fast_mode) { + thread_ctx->wait_for_all(ThreadSHMContext::check_no_buffer_conflict); + } + shm_cc_ops::memcpy(thread_shm_ptr, data + data_offset, + data_elem_num * sizeof(scalar_t)); + thread_ctx->commit_ready_stamp(); if (rank == dst) { shm_cc_ops::memcpy(outputs[rank] + data_offset, data + data_offset, data_elem_num * sizeof(scalar_t)); @@ -508,12 +523,12 @@ void shm_gather_impl(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num, scalar_t* src_ptr = thread_ctx->get_thread_shm_ptr(src_rank); // shm scalar_t* dst_ptr = outputs[src_rank] + data_offset; - shm_cc_ops::memcpy_from_shm(dst_ptr, src_ptr, - data_elem_num * sizeof(scalar_t)); + thread_ctx->wait_for_one(src_rank, + ThreadSHMContext::check_stamp_ready); + shm_cc_ops::memcpy(dst_ptr, src_ptr, + data_elem_num * sizeof(scalar_t)); } } - - thread_ctx->barrier(ThreadSHMStat::DONE); }); return; @@ -599,7 +614,7 @@ struct TensorListMeta { int8_t _padding[40]; }; -void shm_send_tensor_list_impl(ThreadSHMContext* ctx, +void shm_send_tensor_list_impl(ThreadSHMContext* ctx, int64_t dst, const std::vector& tensor_list) { CPU_KERNEL_GUARD_IN(shm_send_tensor_list_impl) std::vector tensor_list_with_metadata; @@ -620,12 +635,11 @@ void shm_send_tensor_list_impl(ThreadSHMContext* ctx, shm_cc_ops::shm_cc_loop( ctx, metadata->total_bytes, [&](ThreadSHMContext* thread_ctx, int64_t data_offset, - int64_t data_elem_num) { + int64_t data_elem_num, bool fast_mode) { int rank = thread_ctx->rank; - // Wait until the receiver set the stat to DONE - thread_ctx->wait_for_one(rank, ThreadSHMStat::SHM_DATA_READY); - int64_t curr_shm_offset = 0; + thread_ctx->wait_for_one(dst, + ThreadSHMContext::check_no_buffer_conflict); while (curr_shm_offset < data_elem_num) { MemPiece frag = metadata->get_data(data_offset + curr_shm_offset); frag.size = std::min(frag.size, data_elem_num - curr_shm_offset); @@ -634,8 +648,7 @@ void shm_send_tensor_list_impl(ThreadSHMContext* ctx, frag.ptr, frag.size); curr_shm_offset += frag.size; } - - thread_ctx->set_thread_stat(rank, ThreadSHMStat::SHM_DATA_READY); + thread_ctx->commit_ready_stamp(); }); } @@ -646,8 +659,7 @@ std::vector shm_recv_tensor_list_impl(ThreadSHMContext* ctx, torch::Tensor metadata_tensor = torch::empty({sizeof(TensorListMeta)}, options); - // Wait until the sender set the stat of the thread 0 to SHM_DATA_READY - ctx->wait_for_one(src, ThreadSHMStat::DONE); + ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready); shm_cc_ops::memcpy(metadata_tensor.data_ptr(), ctx->get_thread_shm_ptr(src), sizeof(TensorListMeta)); @@ -664,9 +676,8 @@ std::vector shm_recv_tensor_list_impl(ThreadSHMContext* ctx, shm_cc_ops::shm_cc_loop( ctx, metadata.total_bytes, [&](ThreadSHMContext* thread_ctx, int64_t data_offset, - int64_t data_elem_num) { - // Wait until the sender set the stat to SHM_DATA_READY - thread_ctx->wait_for_one(src, ThreadSHMStat::DONE); + int64_t data_elem_num, bool fast_mode) { + ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready); int64_t curr_shm_offset = 0; 
while (curr_shm_offset < data_elem_num) { MemPiece frag = metadata.get_data(data_offset + curr_shm_offset); @@ -677,8 +688,6 @@ std::vector shm_recv_tensor_list_impl(ThreadSHMContext* ctx, frag.size); curr_shm_offset += frag.size; } - - thread_ctx->set_thread_stat(src, ThreadSHMStat::DONE); }); std::vector tensor_list; @@ -756,7 +765,8 @@ void shm_send_tensor_list(int64_t handle, int64_t dst) { CPU_KERNEL_GUARD_IN(shm_send_tensor_list) shm_send_tensor_list_impl( - SHMManager::get_singleton_instance(handle)->get_shm_ctx(), tensor_list); + SHMManager::get_singleton_instance(handle)->get_shm_ctx(), dst, + tensor_list); CPU_KERNEL_GUARD_OUT(shm_send_tensor_list) } @@ -778,4 +788,4 @@ std::string join_shm_manager(int64_t handle, const std::string& name) { TORCH_CHECK(shm_manager); shm_manager->join(name); return shm_manager->get_shm_ctx()->to_string(); -} \ No newline at end of file +} diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 60304d229a8f5..ebfc81f858367 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -50,6 +50,27 @@ void shm_send_tensor_list(int64_t handle, std::vector shm_recv_tensor_list(int64_t handle, int64_t src); +at::Tensor weight_packed_linear(at::Tensor& mat1, at::Tensor& mat2, + const std::optional& bias, + bool is_vnni); + +at::Tensor convert_weight_packed(at::Tensor& weight); + +at::Tensor fused_experts_cpu( + at::Tensor& hidden_states, at::Tensor& w1, at::Tensor& w2, + at::Tensor& topk_weights, at::Tensor& topk_ids, bool inplace, + bool use_int8_w8a8, bool use_fp8_w8a16, + const std::optional& w1_scale, + const std::optional& w2_scale, + const std::optional> block_size, + const std::optional& a1_scale, + const std::optional& a2_scale, bool is_vnni); + +at::Tensor int8_scaled_mm_with_quant(at::Tensor& mat1, at::Tensor& mat2, + at::Tensor& scales2, + const std::optional& bias, + at::ScalarType out_dtype, bool is_vnni); + TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -214,6 +235,28 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)", &shm_recv_tensor_list); #endif + + // sgl-kernels +#if defined(__AVX512BF16__) && defined(__AVX512F__) && defined(__AVX512VNNI__) + ops.def( + "weight_packed_linear(Tensor(a0!) mat1, Tensor(a1!) mat2, Tensor(a2!)? " + "bias, bool is_vnni) -> Tensor"); + ops.impl("weight_packed_linear", torch::kCPU, &weight_packed_linear); + ops.def("convert_weight_packed(Tensor! weight) -> Tensor"); + ops.impl("convert_weight_packed", torch::kCPU, &convert_weight_packed); + ops.def( + "fused_experts_cpu(Tensor! hidden_states, Tensor w1, Tensor w2, Tensor " + "topk_weights, Tensor topk_ids, bool inplace, bool use_int8_w8a8, bool " + "use_fp8_w8a16, Tensor? w1_scale, Tensor? w2_scale, SymInt[]? " + "block_size, Tensor? a1_scale, Tensor? a2_scale, bool is_vnni) -> " + "Tensor"); + ops.impl("fused_experts_cpu", torch::kCPU, &fused_experts_cpu); + ops.def( + "int8_scaled_mm_with_quant(Tensor mat1, Tensor mat2, Tensor scales2, " + "Tensor? 
bias, ScalarType out_dtype, bool is_vnni) -> Tensor"); + ops.impl("int8_scaled_mm_with_quant", torch::kCPU, + &int8_scaled_mm_with_quant); +#endif } TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index 370b854def0f3..5f2d0dbe27d34 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -118,6 +118,7 @@ vLLM CPU backend supports the following vLLM features: - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node. By setting to `all`, the OpenMP threads of each rank uses all CPU cores available on the system. Default value is `auto`. - `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `0`. - `VLLM_CPU_MOE_PREPACK`: whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False). +- `VLLM_CPU_SGL_KERNEL` (Experimental): whether to use small-batch optimized kernels for linear layer and MoE layer, especially for low-latency requirements like online serving. The kernels require AMX instruction set, BFloat16 weight type and weight shapes divisible by 32. Default is `0` (False). 
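As a usage illustration, a minimal offline run with the experimental kernels enabled might look like the sketch below (the model name is only a placeholder; the snippet assumes a CPU with Intel AMX and a BFloat16 checkpoint):

```python
import os

# Opt in to the experimental SGL kernels; the default is "0" (off).
os.environ["VLLM_CPU_SGL_KERNEL"] = "1"
os.environ["VLLM_CPU_OMP_THREADS_BIND"] = "auto"

from vllm import LLM, SamplingParams

# The kernels require a BFloat16 model whose weight shapes are divisible
# by 32; otherwise vLLM falls back to the regular CPU path.
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", dtype="bfloat16")
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```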
## Performance tips diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index f656f90c4bd37..7d7a62eec118a 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -78,7 +78,7 @@ AITER_MODEL_LIST = [ ), pytest.param( "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 - marks=[pytest.mark.core_model], + marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), pytest.param( "Qwen/Qwen3-8B", # qwen (text-only) @@ -87,6 +87,7 @@ AITER_MODEL_LIST = [ pytest.param("bigcode/starcoder2-3b"), # starcoder2 pytest.param( "TitanML/tiny-mixtral", # mixtral + marks=[pytest.mark.core_model, pytest.mark.cpu_model], ) ]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 51900de1cc099..36a0395ccdc93 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1850,3 +1850,52 @@ def cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor, torch.ops._C.cutlass_mla_decode(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale) return out + + +if hasattr(torch.ops._C, "weight_packed_linear"): + + @register_fake("_C::weight_packed_linear") + def weight_packed_linear_fake(mat1: torch.Tensor, mat2: torch.Tensor, + bias: Optional[torch.Tensor], + is_vnni: bool) -> torch.Tensor: + return torch.empty((mat1.size(0), mat2.size(0)), + dtype=mat1.dtype, + device=mat2.device) + + +if hasattr(torch.ops._C, "fused_experts_cpu"): + + @register_fake("_C::fused_experts_cpu") + def fused_experts_cpu_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool, + use_int8_w8a8: bool, + use_fp8_w8a16: bool, + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + block_size: Optional[list[int]], + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + is_vnni: bool, + ) -> torch.Tensor: + return torch.empty_like(hidden_states) + + +if hasattr(torch.ops._C, "int8_scaled_mm_with_quant"): + + @register_fake("_C::int8_scaled_mm_with_quant") + def int8_scaled_mm_with_quant_fake( + mat1: torch.Tensor, + mat2: torch.Tensor, + scales2: torch.Tensor, + bias: Optional[torch.Tensor], + out_dtype: torch.dtype, + is_vnni: bool, + ) -> torch.Tensor: + M = mat1.size(0) + N = mat2.size(0) + return torch.empty((M, N), dtype=out_dtype) diff --git a/vllm/envs.py b/vllm/envs.py index a3f19c7ee5c70..c73dbb0a8446f 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -46,6 +46,7 @@ if TYPE_CHECKING: VLLM_CPU_OMP_THREADS_BIND: str = "" VLLM_CPU_NUM_OF_RESERVED_CPU: int = 0 VLLM_CPU_MOE_PREPACK: bool = True + VLLM_CPU_SGL_KERNEL: bool = False VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache") VLLM_XLA_CHECK_RECOMPILATION: bool = False VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024 @@ -447,6 +448,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_CPU_MOE_PREPACK": lambda: bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1"))), + # (CPU backend only) whether to use SGL kernels, optimized for small batch. + "VLLM_CPU_SGL_KERNEL": + lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))), + # If the env var is set, then all workers will execute as separate # processes from the engine, and we use the same mechanism to trigger # execution on all workers. 
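The new flag is consumed later in this patch by the unquantized linear and fused-MoE layers, which only take the SGL path when the hardware and weights qualify. A condensed, illustrative sketch of that gate (the helper name is made up; the individual checks mirror the hunks below):

```python
import torch

from vllm import envs


def cpu_sgl_kernels_usable(weight: torch.Tensor) -> bool:
    """Return True when the experimental CPU SGL kernels can be used."""
    N, K = weight.size()
    return bool(envs.VLLM_CPU_SGL_KERNEL
                and torch._C._cpu._is_amx_tile_supported()
                and weight.dtype == torch.bfloat16
                and N % 32 == 0 and K % 32 == 0)
```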
diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py new file mode 100644 index 0000000000000..68ce6bcccb5d4 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py @@ -0,0 +1,214 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Callable, Optional + +import torch + +from vllm import envs + + +class IPEXFusedMOE: + + def __init__(self, layer: torch.nn.Module) -> None: + import intel_extension_for_pytorch as ipex + layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( + layer.w13_weight, + layer.w2_weight, + use_prepack=envs.VLLM_CPU_MOE_PREPACK, + ) + + def __call__( + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + ) -> torch.Tensor: + assert activation == "silu", f"{activation} is not supported." + assert not apply_router_weight_on_input + return layer.ipex_fusion( + x, + use_grouped_topk, + top_k, + router_logits, + renormalize, + topk_group, + num_expert_group, + custom_routing_function, + scoring_func, + e_score_correction_bias, + ) + + +class SGLFusedMOE: + + def __init__(self, layer: torch.nn.Module) -> None: + pass + + @staticmethod + def _grouped_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + num_expert_group: int = 0, + topk_group: int = 0, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None + ) -> tuple[torch.Tensor, torch.Tensor]: + assert hidden_states.shape[0] == gating_output.shape[0], ( + "Number of tokens mismatch") + + gating_output = gating_output.float() + if scoring_func == "softmax": + scores = torch.softmax(gating_output, dim=-1) + elif scoring_func == "sigmoid": + scores = gating_output.sigmoid() + else: + raise ValueError(f"Unsupported scoring function: {scoring_func}") + + num_token = scores.shape[0] + if e_score_correction_bias is not None: + # Store original scores before applying correction bias. 
We use + # biased scores for expert selection but original scores for + # routing weights + original_scores = scores + scores = scores + e_score_correction_bias.unsqueeze(0) + group_scores = (scores.view(num_token, num_expert_group, + -1).topk(2, dim=-1)[0].sum(dim=-1)) + else: + group_scores = scores.view(num_token, num_expert_group, + -1).max(dim=-1).values # [n, n_group] + group_idx = torch.topk(group_scores, + k=topk_group, + dim=-1, + sorted=False)[1] # [n, top_k_group] + group_mask = torch.zeros_like(group_scores) # [n, n_group] + group_mask.scatter_(1, group_idx, 1) # [n, n_group] + score_mask = group_mask.unsqueeze(-1).expand( + num_token, num_expert_group, + scores.shape[-1] // num_expert_group).reshape(num_token, + -1) # [n, e] + tmp_scores = scores.masked_fill(~score_mask.bool(), + float("-inf")) # [n, e] + + if e_score_correction_bias is not None: + topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)[1] + # Use original unbiased scores for the routing weights + topk_weights = original_scores.gather(1, topk_ids) + else: + topk_weights, topk_ids = torch.topk(tmp_scores, + k=topk, + dim=-1, + sorted=False) + + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, + keepdim=True) + + return topk_weights, topk_ids.to(torch.int32) + + @staticmethod + def _select_experts( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + use_grouped_topk: bool, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + # DeekSeekv2 uses grouped_top_k + if use_grouped_topk: + assert topk_group is not None + assert num_expert_group is not None + topk_weights, topk_ids = SGLFusedMOE._grouped_topk( + hidden_states=hidden_states, + gating_output=router_logits, + topk=top_k, + renormalize=renormalize, + num_expert_group=num_expert_group, + topk_group=topk_group, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) + elif custom_routing_function is None: + assert scoring_func == "softmax" + topk_weights = torch.nn.functional.softmax(router_logits, + dim=1, + dtype=torch.float32) + topk_weights, topk_ids = torch.topk(topk_weights, top_k, dim=-1) + if renormalize: + topk_weights /= topk_weights.sum(dim=-1, keepdim=True) + topk_ids = topk_ids.to(torch.int32) + else: + topk_weights, topk_ids = custom_routing_function( + hidden_states=hidden_states, + gating_output=router_logits, + topk=top_k, + renormalize=renormalize) + + return topk_weights, topk_ids + + def __call__( + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + ) -> torch.Tensor: + assert activation == "silu", f"{activation} is not supported." 
+ assert not apply_router_weight_on_input + topk_weights, topk_ids = SGLFusedMOE._select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + ) + + torch.ops._C.fused_experts_cpu( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights, + topk_ids, + True, + False, + False, + None, + None, + None, + None, + None, + True, + ) + return x diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index e6f555d315d8e..d6ead084af99c 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -550,12 +550,23 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): if current_platform.is_cpu(): if current_platform.get_cpu_architecture() == CpuArchEnum.X86: - import intel_extension_for_pytorch as ipex - layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( - layer.w13_weight, - layer.w2_weight, - use_prepack=envs.VLLM_CPU_MOE_PREPACK, - ) + from vllm.model_executor.layers.fused_moe import cpu_fused_moe + dtype = layer.w13_weight.dtype + if (envs.VLLM_CPU_SGL_KERNEL + and torch._C._cpu._is_amx_tile_supported() + and dtype == torch.bfloat16): + packed_w13_weight = torch.ops._C.convert_weight_packed( + layer.w13_weight) + assert packed_w13_weight.size() == layer.w13_weight.size() + layer.w13_weight.copy_(packed_w13_weight) + del packed_w13_weight + packed_w2_weight = torch.ops._C.convert_weight_packed( + layer.w2_weight) + assert packed_w2_weight.size() == layer.w2_weight.size() + layer.w2_weight.copy_(packed_w2_weight) + layer.cpu_fused_moe = cpu_fused_moe.SGLFusedMOE(layer) + else: + layer.cpu_fused_moe = cpu_fused_moe.IPEXFusedMOE(layer) else: raise NotImplementedError("CPU MOE only supports x86 arch.") @@ -673,13 +684,12 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", e_score_correction_bias: Optional[torch.Tensor] = None, - activation: str = "silu", apply_router_weight_on_input: bool = False, + activation: str = "silu", **kwargs, ): - assert activation == "silu", f"{activation} is not supported." 
- assert apply_router_weight_on_input is False - return layer.ipex_fusion( + return layer.cpu_fused_moe( + layer, x, use_grouped_topk, top_k, @@ -687,9 +697,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): renormalize, topk_group, num_expert_group, + global_num_experts, + expert_map, custom_routing_function, scoring_func, e_score_correction_bias, + apply_router_weight_on_input, + activation, ) def forward_hpu( @@ -764,7 +778,12 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): expert_map=expert_map, renormalize=renormalize) - forward_native = forward_tpu if current_platform.is_tpu() else forward_cuda + if current_platform.is_tpu(): + forward_native = forward_tpu + elif current_platform.is_cpu(): + forward_native = forward_cpu + else: + forward_native = forward_cuda def determine_expert_map( diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 588aa8deb1832..a05ae0edbd775 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -9,6 +9,7 @@ import torch import torch.nn as nn from torch.nn.parameter import Parameter, UninitializedParameter +from vllm import envs from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, @@ -27,6 +28,7 @@ from vllm.model_executor.parameter import (BasevLLMParameter, RowvLLMParameter) # yapf: enable from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform logger = init_logger(__name__) @@ -195,12 +197,33 @@ class UnquantizedLinearMethod(LinearMethodBase): layer.register_parameter("weight", weight) set_weight_attrs(weight, extra_weight_attrs) + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + if current_platform.is_cpu() and envs.VLLM_CPU_SGL_KERNEL: + N, K = layer.weight.size() + dtype = layer.weight.dtype + if (torch._C._cpu._is_amx_tile_supported() + and dtype == torch.bfloat16 and N % 32 == 0 + and K % 32 == 0): + packed_weight = torch.ops._C.convert_weight_packed( + layer.weight) + assert packed_weight.size() == layer.weight.size() + layer.weight.copy_(packed_weight) + if layer.bias is not None: + layer.bias = Parameter(layer.bias.to(torch.float32), + requires_grad=False) + layer.use_cpu_sgl = True + else: + logger.warning( + "CPU SGL kernels require Intel AMX support," + " bfloat16 weight, IC and OC are divisible by 32.") + layer.use_cpu_sgl = False + def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - return dispatch_unquantized_gemm()(x, layer.weight, bias) + return dispatch_unquantized_gemm()(layer, x, layer.weight, bias) class LinearBase(torch.nn.Module): diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 41b5253dca048..ad4ba9c0b827a 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -63,7 +63,15 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, return logits -def rocm_unquantized_gemm(x: torch.Tensor, +def default_unquantized_gemm(layer: torch.nn.Module, + x: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None): + return torch.nn.functional.linear(x, weight, bias) + + +def rocm_unquantized_gemm(layer: torch.nn.Module, + x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None): from vllm.platforms.rocm import on_gfx9 @@ -89,7 +97,20 @@ def rocm_unquantized_gemm(x: 
torch.Tensor, return torch.nn.functional.linear(x, weight, bias) +def cpu_unquantized_gemm(layer: torch.nn.Module, + x: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None): + if getattr(layer, "use_cpu_sgl", False): + return torch.ops._C.weight_packed_linear(x, weight, bias, True) + else: + return torch.nn.functional.linear(x, weight, bias) + + def dispatch_unquantized_gemm() -> Callable[..., torch.Tensor]: if current_platform.is_rocm(): return rocm_unquantized_gemm - return torch.nn.functional.linear + elif current_platform.is_cpu(): + return cpu_unquantized_gemm + else: + return default_unquantized_gemm diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 9ff3a7a7327d9..f35f969781bd1 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -43,7 +43,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase): layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - return dispatch_unquantized_gemm()(x, layer.weight, bias) + return dispatch_unquantized_gemm()(layer, x, layer.weight, bias) def embedding(self, layer: torch.nn.Module, input_: torch.Tensor) -> torch.Tensor: diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 106bce162003f..dccd60f4463aa 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -194,6 +194,8 @@ class CpuPlatform(Platform): "epilogue_fusion": True, }) + if compilation_config.use_inductor: + compilation_config.custom_ops = ["none"] if vllm_config.lora_config is not None: compilation_config.level = CompilationLevel.NO_COMPILATION From 08d81f1014d174d4dd96518914c7ed9767c67a3f Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Tue, 1 Jul 2025 03:29:08 -0400 Subject: [PATCH 011/167] [Bugfix] Fix deepep tests (#20288) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- tests/kernels/moe/test_deepep_deepgemm_moe.py | 2 +- tests/kernels/moe/test_deepep_moe.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 475427f439289..008406c3f1593 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -30,7 +30,7 @@ if has_deep_ep(): from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 DeepEPLLPrepareAndFinalize) - from .deepep_utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a + from .utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a if has_deep_gemm(): import deep_gemm diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index 80a36dc39712a..94947c809e3a3 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -31,7 +31,7 @@ if has_deep_ep(): from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 DeepEPLLPrepareAndFinalize) - from .deepep_utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a + from .utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a requires_deep_ep = pytest.mark.skipif( not has_deep_ep(), From b1c1fe35a599cfd3c0404702c65c2381b025bc6a Mon Sep 17 00:00:00 2001 From: Kebe Date: Tue, 1 Jul 2025 15:33:22 +0800 Subject: [PATCH 012/167] [Misc] remove redundant char (#20287) Signed-off-by: Kebe --- benchmarks/benchmark_serving.py | 2 +- 
vllm/benchmarks/serve.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 886a51e1cbd9a..9b235266dff1a 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -551,7 +551,7 @@ async def benchmark( "total_input_tokens": metrics.total_input, "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, - "request_goodput:": metrics.request_goodput if goodput_config_dict else None, + "request_goodput": metrics.request_goodput if goodput_config_dict else None, "output_throughput": metrics.output_throughput, "total_token_throughput": metrics.total_token_throughput, "input_lens": [output.prompt_len for output in outputs], diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 419284cca042e..8b16fea9e3d3c 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -498,7 +498,7 @@ async def benchmark( "total_input_tokens": metrics.total_input, "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, - "request_goodput:": + "request_goodput": metrics.request_goodput if goodput_config_dict else None, "output_throughput": metrics.output_throughput, "total_token_throughput": metrics.total_token_throughput, From 96453cfa831340788ef72c42bc2a1a2b4496a27f Mon Sep 17 00:00:00 2001 From: TY-AMD Date: Tue, 1 Jul 2025 16:12:19 +0800 Subject: [PATCH 013/167] [BugFix][V1][ROCm] Triton MLA uses V0 backend on V1 engine (#19067) Signed-off-by: Tianyuan Wu --- .../attention/test_attention_selector.py | 6 +- .../attention/test_rocm_attention_selector.py | 6 +- vllm/platforms/rocm.py | 10 +++- vllm/v1/attention/backends/mla/common.py | 9 ++- vllm/v1/attention/backends/mla/triton_mla.py | 57 +++++++++++++++++++ 5 files changed, 78 insertions(+), 10 deletions(-) diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index f3e64155703c2..a8ed749ba13b5 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -106,10 +106,8 @@ def test_env( block_size, False, use_mla=use_mla) - if use_v1 and name != "TRITON_MLA": - assert backend.get_name() == f"{name}_VLLM_V1" - else: - assert backend.get_name() == name + expected = f"{name}_VLLM_V1" if use_v1 else name + assert backend.get_name() == expected else: with pytest.raises(ValueError) as exc_info: get_attn_backend(16, diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py index ed58880cc9e6c..34311b9ccd767 100644 --- a/tests/kernels/attention/test_rocm_attention_selector.py +++ b/tests/kernels/attention/test_rocm_attention_selector.py @@ -35,7 +35,8 @@ def test_selector(monkeypatch: pytest.MonkeyPatch): m.setenv(STR_BACKEND_ENV_VAR, "TRITON_MLA") backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, False, True) - assert backend.get_name() == "TRITON_MLA" + assert (backend.get_name() == "TRITON_MLA" + or backend.get_name() == "TRITON_MLA_VLLM_V1") # If attention backend is None # If use_mla is true @@ -43,7 +44,8 @@ def test_selector(monkeypatch: pytest.MonkeyPatch): m.setenv(STR_BACKEND_ENV_VAR, None) backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, False, True) - assert backend.get_name() == "TRITON_MLA" + assert (backend.get_name() == "TRITON_MLA" + or backend.get_name() == "TRITON_MLA_VLLM_V1") # change the attention backend to 
AITER MLA m.setenv(STR_BACKEND_ENV_VAR, "ROCM_AITER_MLA") diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 08d471d5a983c..ee53a76ceb6db 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -186,8 +186,14 @@ class RocmPlatform(Platform): if selected_backend == _Backend.TRITON_MLA: if block_size != 1: - logger.info("Using Triton MLA backend.") - return "vllm.attention.backends.triton_mla.TritonMLABackend" # noqa: E501 + if use_v1: + logger.info_once( + "Using Triton MLA backend on V1 engine.") + return ("vllm.v1.attention.backends.mla." + "triton_mla.TritonMLABackend") + else: + logger.info("Using Triton MLA backend.") + return "vllm.attention.backends.triton_mla.TritonMLABackend" # noqa: E501 else: raise ValueError( f" The selected backend, {selected_backend.name}," diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 1878ae74dbc6f..d45ec04472a69 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -640,7 +640,6 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): self.qk_head_dim = qk_head_dim self.v_head_dim = v_head_dim self.kv_b_proj = kv_b_proj - self.vllm_flash_attn_version = get_flash_attn_version() # Handle the differences between the flash_attn_varlen from flash_attn # and the one from vllm_flash_attn. The former is used on RoCM and the @@ -672,11 +671,17 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): maybe_padded_v = torch.nn.functional.pad( v, [0, q.shape[-1] - v.shape[-1]], value=0) + if is_vllm_fa: + kwargs["return_softmax_lse"] = return_softmax_lse + else: + # ROCm leverages the upstream flash_attn, which takes a parameter + # called "return_attn_probs" instead of return_softmax_lse + kwargs["return_attn_probs"] = return_softmax_lse + attn_out = self.flash_attn_varlen_func( q=q, k=k, v=maybe_padded_v, - return_softmax_lse=return_softmax_lse, softmax_scale=softmax_scale, **kwargs, ) diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py index e26d7909184b5..99938f22f108c 100644 --- a/vllm/v1/attention/backends/mla/triton_mla.py +++ b/vllm/v1/attention/backends/mla/triton_mla.py @@ -5,10 +5,14 @@ from typing import Any, Optional import torch +from vllm import envs from vllm.attention.backends.abstract import (AttentionType, is_quantized_kv_cache) from vllm.attention.ops.triton_decode_attention import decode_attention_fwd +from vllm.attention.ops.triton_flash_attention import triton_attention from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.triton_utils import HAS_TRITON from vllm.v1.attention.backends.mla.common import (MLACommonBackend, MLACommonImpl, MLACommonMetadata) @@ -68,6 +72,59 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]): raise NotImplementedError( "TritonMLA V1 with FP8 KV cache not yet supported") + self.use_triton_flash_attn = envs.VLLM_USE_TRITON_FLASH_ATTN + self.triton_fa_func = triton_attention if HAS_TRITON else None + + def _flash_attn_varlen_diff_headdims_rocm(self, + q, + k, + v, + softmax_scale=None, + **kwargs): + assert self.triton_fa_func is not None + + # Triton Attention requires a padded V + padded_v = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]], + value=0) + # The output of triton_attention is a tuple of + # [output_tensor, encoded_softmax] where encoded_softmax is always None + output_tensor, _ = self.triton_fa_func( + q, + k, + padded_v, + None, # output + kwargs["cu_seqlens_q"], + 
kwargs["cu_seqlens_k"], + kwargs["max_seqlen_q"], + kwargs["max_seqlen_k"], + kwargs["causal"], + softmax_scale, + None, # bias + ) + + return output_tensor + + def _flash_attn_varlen_diff_headdims(self, + q, + k, + v, + return_softmax_lse=False, + softmax_scale=None, + **kwargs): + if current_platform.is_rocm() \ + and self.use_triton_flash_attn \ + and not return_softmax_lse: + return self._flash_attn_varlen_diff_headdims_rocm( + q, k, v, softmax_scale=softmax_scale, **kwargs) + else: + return super()._flash_attn_varlen_diff_headdims( + q, + k, + v, + return_softmax_lse=return_softmax_lse, + softmax_scale=softmax_scale, + **kwargs) + def _forward_decode( self, q_nope: torch.Tensor, From 787b13389e2c0b114074f0a0d715eeb6c0a2b0c5 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Tue, 1 Jul 2025 16:18:09 +0800 Subject: [PATCH 014/167] [doc] fix the incorrect logo in dark mode (#20289) Signed-off-by: reidliu41 --- docs/README.md | 3 ++- docs/mkdocs/stylesheets/extra.css | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/README.md b/docs/README.md index 9fb3137b31928..e1d1046951a59 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,7 +1,8 @@ # Welcome to vLLM
- ![](./assets/logos/vllm-logo-text-light.png){ align="center" alt="vLLM" class="no-scaled-link" width="60%" } + ![](./assets/logos/vllm-logo-text-light.png){ align="center" alt="vLLM Light" class="logo-light" width="60%" } + ![](./assets/logos/vllm-logo-text-dark.png){ align="center" alt="vLLM Dark" class="logo-dark" width="60%" }

diff --git a/docs/mkdocs/stylesheets/extra.css b/docs/mkdocs/stylesheets/extra.css index 248711f491b9d..892013c1cddfa 100644 --- a/docs/mkdocs/stylesheets/extra.css +++ b/docs/mkdocs/stylesheets/extra.css @@ -134,3 +134,12 @@ body[data-md-color-scheme="slate"] .md-nav__item--section > label.md-nav__link . opacity: 0.9; transform: translateY(2px); } + +/* For logo css */ +[data-md-color-scheme="default"] .logo-dark { + display: none; +} + +[data-md-color-scheme="slate"] .logo-light { + display: none; +} From c05596f1a350f3d993c467959ed02492141c2527 Mon Sep 17 00:00:00 2001 From: Lionel Villard Date: Tue, 1 Jul 2025 05:10:28 -0400 Subject: [PATCH 015/167] [Perf] Validate @config in pre-commit instead of dynamically (#20200) Signed-off-by: Lionel Villard --- .pre-commit-config.yaml | 7 ++ tests/test_config.py | 35 +----- tests/tools/__init__.py | 0 tests/tools/test_config_validator.py | 49 +++++++++ tools/validate_config.py | 158 +++++++++++++++++++++++++++ vllm/config.py | 28 +---- 6 files changed, 220 insertions(+), 57 deletions(-) create mode 100644 tests/tools/__init__.py create mode 100644 tests/tools/test_config_validator.py create mode 100644 tools/validate_config.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 15ef5defff69e..d962252eb3dd8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -160,6 +160,13 @@ repos: types: [python] pass_filenames: false additional_dependencies: [pathspec, regex] + - id: validate-config + name: Validate configuration has default values and that each field has a docstring + entry: python tools/validate_config.py + language: python + types: [python] + pass_filenames: true + files: vllm/config.py|tests/test_config.py # Keep `suggestion` last - id: suggestion name: Suggestion diff --git a/tests/test_config.py b/tests/test_config.py index 5d5c4453d30d2..cb7654c26afc8 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -2,49 +2,16 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import MISSING, Field, asdict, dataclass, field -from typing import Literal, Union import pytest from vllm.compilation.backends import VllmBackend from vllm.config import (LoadConfig, ModelConfig, PoolerConfig, VllmConfig, - config, get_field) + get_field) from vllm.model_executor.layers.pooler import PoolingType from vllm.platforms import current_platform -class _TestConfig1: - pass - - -@dataclass -class _TestConfig2: - a: int - """docstring""" - - -@dataclass -class _TestConfig3: - a: int = 1 - - -@dataclass -class _TestConfig4: - a: Union[Literal[1], Literal[2]] = 1 - """docstring""" - - -@pytest.mark.parametrize(("test_config", "expected_error"), [ - (_TestConfig1, "must be a dataclass"), - (_TestConfig2, "must have a default"), - (_TestConfig3, "must have a docstring"), - (_TestConfig4, "must use a single Literal"), -]) -def test_config(test_config, expected_error): - with pytest.raises(Exception, match=expected_error): - config(test_config) - - def test_compile_config_repr_succeeds(): # setup: VllmBackend mutates the config object config = VllmConfig() diff --git a/tests/tools/__init__.py b/tests/tools/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tools/test_config_validator.py b/tests/tools/test_config_validator.py new file mode 100644 index 0000000000000..b0475894a114e --- /dev/null +++ b/tests/tools/test_config_validator.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project + +import ast + +import pytest + +from tools.validate_config import validate_ast + +_TestConfig1 = ''' +@config +class _TestConfig1: + pass +''' + +_TestConfig2 = ''' +@config +@dataclass +class _TestConfig2: + a: int + """docstring""" +''' + +_TestConfig3 = ''' +@config +@dataclass +class _TestConfig3: + a: int = 1 +''' + +_TestConfig4 = ''' +@config +@dataclass +class _TestConfig4: + a: Union[Literal[1], Literal[2]] = 1 + """docstring""" +''' + + +@pytest.mark.parametrize(("test_config", "expected_error"), [ + (_TestConfig1, "must be a dataclass"), + (_TestConfig2, "must have a default"), + (_TestConfig3, "must have a docstring"), + (_TestConfig4, "must use a single Literal"), +]) +def test_config(test_config, expected_error): + tree = ast.parse(test_config) + with pytest.raises(Exception, match=expected_error): + validate_ast(tree) diff --git a/tools/validate_config.py b/tools/validate_config.py new file mode 100644 index 0000000000000..8b1e955c653d7 --- /dev/null +++ b/tools/validate_config.py @@ -0,0 +1,158 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Ensures all fields in a config dataclass have default values +and that each field has a docstring. +""" + +import ast +import inspect +import sys + + +def get_attr_docs(cls_node: ast.ClassDef) -> dict[str, str]: + """ + Get any docstrings placed after attribute assignments in a class body. + + Adapted from https://davidism.com/attribute-docstrings/ + https://davidism.com/mit-license/ + """ + + def pairwise(iterable): + """ + Manually implement https://docs.python.org/3/library/itertools.html#itertools.pairwise + + Can be removed when Python 3.9 support is dropped. + """ + iterator = iter(iterable) + a = next(iterator, None) + + for b in iterator: + yield a, b + a = b + + out = {} + + # Consider each pair of nodes. + for a, b in pairwise(cls_node.body): + # Must be an assignment then a constant string. + if (not isinstance(a, (ast.Assign, ast.AnnAssign)) + or not isinstance(b, ast.Expr) + or not isinstance(b.value, ast.Constant) + or not isinstance(b.value.value, str)): + continue + + doc = inspect.cleandoc(b.value.value) + + # An assignment can have multiple targets (a = b = v), but an + # annotated assignment only has one target. + targets = a.targets if isinstance(a, ast.Assign) else [a.target] + + for target in targets: + # Must be assigning to a plain name. + if not isinstance(target, ast.Name): + continue + + out[target.id] = doc + + return out + + +class ConfigValidator(ast.NodeVisitor): + + def __init__(self): + ... + + def visit_ClassDef(self, node): + # Validate class with both @config and @dataclass decorators + decorators = [ + id for d in node.decorator_list if (isinstance(d, ast.Name) and ( + (id := d.id) == 'config' or id == 'dataclass')) or + (isinstance(d, ast.Call) and (isinstance(d.func, ast.Name) and + (id := d.func.id) == 'dataclass')) + ] + + if set(decorators) == {'config', 'dataclass'}: + validate_class(node) + elif set(decorators) == {'config'}: + fail( + f"Class {node.name} with config decorator must be a dataclass.", + node) + + self.generic_visit(node) + + +def validate_class(class_node: ast.ClassDef): + attr_docs = get_attr_docs(class_node) + + for stmt in class_node.body: + # A field is defined as a class variable that has a type annotation. 
+ if isinstance(stmt, ast.AnnAssign): + # Skip ClassVar + # see https://docs.python.org/3/library/dataclasses.html#class-variables + if isinstance(stmt.annotation, ast.Subscript) and isinstance( + stmt.annotation.value, + ast.Name) and stmt.annotation.value.id == "ClassVar": + continue + + if isinstance(stmt.target, ast.Name): + field_name = stmt.target.id + if stmt.value is None: + fail( + f"Field '{field_name}' in {class_node.name} must have " + "a default value.", stmt) + + if field_name not in attr_docs: + fail( + f"Field '{field_name}' in {class_node.name} must have " + "a docstring.", stmt) + + if isinstance(stmt.annotation, ast.Subscript) and \ + isinstance(stmt.annotation.value, ast.Name) \ + and stmt.annotation.value.id == "Union" and \ + isinstance(stmt.annotation.slice, ast.Tuple): + args = stmt.annotation.slice.elts + literal_args = [ + arg for arg in args + if isinstance(arg, ast.Subscript) and isinstance( + arg.value, ast.Name) and arg.value.id == "Literal" + ] + if len(literal_args) > 1: + fail( + f"Field '{field_name}' in {class_node.name} must " + "use a single " + "Literal type. Please use 'Literal[Literal1, " + "Literal2]' instead of 'Union[Literal1, Literal2]'" + ".", stmt) + + +def validate_ast(tree: ast.stmt): + ConfigValidator().visit(tree) + + +def validate_file(file_path: str): + try: + print(f"validating {file_path} config dataclasses ", end="") + with open(file_path, encoding="utf-8") as f: + source = f.read() + + tree = ast.parse(source, filename=file_path) + validate_ast(tree) + except ValueError as e: + print(e) + SystemExit(2) + else: + print("✅") + + +def fail(message: str, node: ast.stmt): + raise ValueError(f"❌ line({node.lineno}): {message}") + + +def main(): + for filename in sys.argv[1:]: + validate_file(filename) + + +if __name__ == "__main__": + main() diff --git a/vllm/config.py b/vllm/config.py index 46a5bf34f66e4..6412e6e293b45 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -18,7 +18,7 @@ from functools import cached_property from importlib.util import find_spec from pathlib import Path from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional, - Protocol, TypeVar, Union, cast, get_args, get_origin) + Protocol, TypeVar, Union, cast, get_args) import regex as re import torch @@ -193,28 +193,10 @@ def config(cls: ConfigT) -> ConfigT: (i.e. `ConfigT(**json.loads(cli_arg))`). However, if a particular `ConfigT` requires custom construction from CLI (i.e. `CompilationConfig`), it can have a `from_cli` method, which will be called instead. + + Config validation is performed by the tools/validate_config.py + script, which is invoked during the pre-commit checks. """ - if not is_dataclass(cls): - raise TypeError("The decorated class must be a dataclass.") - attr_docs = get_attr_docs(cls) - for f in fields(cls): - if f.init and f.default is MISSING and f.default_factory is MISSING: - raise ValueError( - f"Field '{f.name}' in {cls.__name__} must have a default value." - ) - - if f.name not in attr_docs: - raise ValueError( - f"Field '{f.name}' in {cls.__name__} must have a docstring.") - - if get_origin(f.type) is Union: - args = get_args(f.type) - literal_args = [arg for arg in args if get_origin(arg) is Literal] - if len(literal_args) > 1: - raise ValueError( - f"Field '{f.name}' in {cls.__name__} must use a single " - "Literal type. 
Please use 'Literal[Literal1, Literal2]' " - "instead of 'Union[Literal1, Literal2]'.") return cls @@ -1798,7 +1780,7 @@ class ParallelConfig: eplb_step_interval: int = 3000 """ Interval for rearranging experts in expert parallelism. - + Note that if this is greater than the EPLB window size, only the metrics of the last `eplb_window_size` steps will be used for rearranging experts. """ From 9025a9a7050253678431b2c20e6dd0be55f0dcc2 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 1 Jul 2025 06:20:34 -0400 Subject: [PATCH 016/167] [Quant] [Bugfix] Fix quantization config matching with `hf_to_vllm_mapper` (#20046) --- .../test_register_quantization_config.py | 1 + vllm/lora/models.py | 2 +- vllm/lora/worker_manager.py | 5 +--- .../layers/quantization/base_config.py | 13 +++++++++++ .../layers/quantization/bitblas.py | 1 + .../compressed_tensors/compressed_tensors.py | 17 +++++++++++++- .../model_executor/layers/quantization/fp8.py | 10 +++++++- .../layers/quantization/gptq_bitblas.py | 1 + .../layers/quantization/marlin.py | 2 ++ .../layers/quantization/modelopt.py | 1 + .../layers/quantization/torchao.py | 1 + vllm/model_executor/model_loader/utils.py | 22 ++++++++++-------- vllm/model_executor/models/interfaces.py | 23 ++++++++++++++++--- vllm/model_executor/models/qwen2_5_vl.py | 14 +++++------ vllm/model_executor/models/transformers.py | 1 + vllm/model_executor/models/utils.py | 15 +++++++++++- vllm/model_executor/utils.py | 7 ++++-- 17 files changed, 107 insertions(+), 29 deletions(-) diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index 42081a8c68cdc..6c541fdbeeae2 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -53,6 +53,7 @@ class CustomQuantConfig(QuantizationConfig): def __init__(self, num_bits: int = 8) -> None: """Initialize the quantization config.""" + super().__init__() self.num_bits = num_bits def get_name(self) -> QuantizationMethods: diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 262e6799583ae..9e1ed3a771798 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -805,7 +805,7 @@ def create_lora_manager( lora_manager_cls: type[LoRAModelManager] = LoRAModelManager, **kwargs) -> LoRAModelManager: """Create a LoRA adapter for a given model.""" - if not hasattr(model, "packed_modules_mapping"): + if not isinstance(model, SupportsLoRA): raise ValueError(f"Model {type(model)} is not supported for LoRA.") lora_manager = lora_manager_cls( model=model, diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 7da44569f4086..7a4af74cbeb12 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -111,10 +111,7 @@ class WorkerLoRAManager(AbstractWorkerManager): # For some models like Qwen2VL, we need to use hf_to_vllm_mapper # to ensure correct loading of lora weights. 
model = self._adapter_manager.model - hf_to_vllm_mapper = None - if (hasattr(model, "hf_to_vllm_mapper") - and model.hf_to_vllm_mapper is not None): - hf_to_vllm_mapper = model.hf_to_vllm_mapper + hf_to_vllm_mapper = getattr(model, "hf_to_vllm_mapper", None) lora = self._lora_model_cls.from_local_checkpoint( lora_path, diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 78c5c75c06515..4a43351260e9f 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -10,6 +10,7 @@ from torch import nn if TYPE_CHECKING: from vllm.model_executor.layers.quantization import QuantizationMethods + from vllm.model_executor.models.utils import WeightsMapper else: QuantizationMethods = str @@ -149,3 +150,15 @@ class QuantizationConfig(ABC): def get_cache_scale(self, name: str) -> Optional[str]: return None + + def apply_vllm_mapper( # noqa: B027 + self, hf_to_vllm_mapper: "WeightsMapper"): + """ + Interface for models to update module names referenced in + quantization configs in order to reflect the vllm model structure + + :param hf_to_vllm_mapper: maps from hf model structure (the assumed + structure of the qconfig) to vllm model structure + """ + # TODO (@kylesayrs): add implementations for all subclasses + pass diff --git a/vllm/model_executor/layers/quantization/bitblas.py b/vllm/model_executor/layers/quantization/bitblas.py index 9e5ce39ec8f2e..aa8eee88a9f9e 100644 --- a/vllm/model_executor/layers/quantization/bitblas.py +++ b/vllm/model_executor/layers/quantization/bitblas.py @@ -63,6 +63,7 @@ class BitBLASConfig(QuantizationConfig): # (since we have only one group per output channel) desc_act = False + super().__init__() self.weight_bits = weight_bits self.group_size = group_size self.desc_act = desc_act diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 4f87b2a44f0ac..e7f65d13181d8 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import suppress -from typing import Any, Literal, Optional, cast +from typing import TYPE_CHECKING, Any, Literal, Optional, cast import torch from compressed_tensors.config import (CompressionFormat, @@ -37,6 +37,9 @@ from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import cutlass_fp4_supported) from vllm.platforms import current_platform +if TYPE_CHECKING: + from vllm.model_executor.models.utils import WeightsMapper + logger = init_logger(__name__) __all__ = ["CompressedTensorsLinearMethod"] @@ -80,6 +83,18 @@ class CompressedTensorsConfig(QuantizationConfig): def get_name(self) -> QuantizationMethods: return "compressed-tensors" + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + self.target_scheme_map = hf_to_vllm_mapper.apply_dict( + self.target_scheme_map) + self.ignore = hf_to_vllm_mapper.apply_list(self.ignore) + self.sparsity_scheme_map = hf_to_vllm_mapper.apply_dict( + self.sparsity_scheme_map) + self.sparsity_ignore_list = hf_to_vllm_mapper.apply_list( + self.sparsity_ignore_list) + if self.kv_cache_scheme is not None: + self.kv_cache_scheme = hf_to_vllm_mapper.apply_dict( + self.kv_cache_scheme) + def 
get_quant_method( self, layer: torch.nn.Module, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 93472207fbb86..60df679a74bda 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools -from typing import Any, Callable, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Optional, Union import torch import torch.nn.functional as F @@ -39,6 +39,9 @@ from vllm.platforms import current_platform from vllm.scalar_type import scalar_types from vllm.utils import has_deep_gemm +if TYPE_CHECKING: + from vllm.model_executor.models.utils import WeightsMapper + ACTIVATION_SCHEMES = ["static", "dynamic"] logger = init_logger(__name__) @@ -100,6 +103,11 @@ class Fp8Config(QuantizationConfig): def get_config_filenames(cls) -> list[str]: return [] + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + if self.ignored_layers is not None: + self.ignored_layers = hf_to_vllm_mapper.apply_list( + self.ignored_layers) + @classmethod def from_config(cls, config: dict[str, Any]) -> "Fp8Config": quant_method = cls.get_from_keys(config, ["quant_method"]) diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py index 78e0f59fa4bee..caeb266d0b933 100644 --- a/vllm/model_executor/layers/quantization/gptq_bitblas.py +++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py @@ -81,6 +81,7 @@ class GPTQBitBLASConfig(QuantizationConfig): # (since we have only one group per output channel) desc_act = False + super().__init__() self.weight_bits = weight_bits self.group_size = group_size self.desc_act = desc_act diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 62667db26b669..18d1c13373df9 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -32,6 +32,8 @@ class MarlinConfig(QuantizationConfig): group_size: int, lm_head_quantized: bool, ) -> None: + super().__init__() + # Group size for the quantization. 
self.group_size = group_size self.lm_head_quantized = lm_head_quantized diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index e35db5b31dba7..a10911b84afc4 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -181,6 +181,7 @@ class ModelOptNvFp4Config(QuantizationConfig): exclude_modules: list[str], group_size: int = 16, ) -> None: + super().__init__() self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized if is_checkpoint_nvfp4_serialized: logger.warning( diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py index a4e0356c02689..63b2ab6bab063 100644 --- a/vllm/model_executor/layers/quantization/torchao.py +++ b/vllm/model_executor/layers/quantization/torchao.py @@ -55,6 +55,7 @@ class TorchAOConfig(QuantizationConfig): os.environ["VLLM_DISABLE_COMPILE_CACHE"] = "1" logger.info("Using TorchAO: Setting VLLM_DISABLE_COMPILE_CACHE=1") """ + super().__init__() self.torchao_config = torchao_config self.skip_modules = skip_modules or [] diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 79e6fa7b16dc7..159e7b1e6b01a 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -24,6 +24,7 @@ from vllm.model_executor.models import ModelRegistry from vllm.model_executor.models.adapters import (as_classification_model, as_embedding_model, as_reward_model) +from vllm.model_executor.models.interfaces import SupportsQuant from vllm.utils import is_pin_memory_available logger = init_logger(__name__) @@ -294,13 +295,16 @@ def configure_quant_config(quant_config: QuantizationConfig, Note that model attributes are passed by reference to quant_config, enabling them to be updated by model_class.__new__ (ex. 
chatglm, qwen) + + Once the `SupportsQuant` mixin has been added to all models, this + function can be removed """ - packed_mapping = getattr(model_class, "packed_modules_mapping", None) - if packed_mapping is not None: - # pass packed_modules_mapping by reference to quant_config - quant_config.packed_modules_mapping = packed_mapping - else: - logger.warning( - "The model class %s has not defined `packed_modules_mapping`, " - "this may lead to incorrect mapping of quantized or ignored " - "modules", model_class.__name__) + if not issubclass(model_class, SupportsQuant): + hf_to_vllm_mapper = getattr(model_class, "hf_to_vllm_mapper", None) + packed_mapping = getattr(model_class, "packed_modules_mapping", None) + + # pass mappings by reference to quant_config + if hf_to_vllm_mapper is not None: + quant_config.apply_vllm_mapper(hf_to_vllm_mapper) + if packed_mapping is not None: + quant_config.packed_modules_mapping = packed_mapping diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index ad59fe79edcb1..d234632ef1b75 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -18,6 +18,7 @@ from .interfaces_base import is_pooling_model if TYPE_CHECKING: from vllm.attention import AttentionMetadata + from vllm.model_executor.models.utils import WeightsMapper from vllm.sequence import IntermediateTensors logger = init_logger(__name__) @@ -566,20 +567,36 @@ def has_step_pooler(model: Union[type[object], object]) -> bool: class SupportsQuant: """The interface required for all models that support quantization.""" - packed_modules_mapping: ClassVar[dict[str, list[str]]] = {} + hf_to_vllm_mapper: ClassVar[Optional["WeightsMapper"]] = None + packed_modules_mapping: ClassVar[Optional[dict[str, list[str]]]] = None quant_config: Optional[QuantizationConfig] = None def __new__(cls, *args, **kwargs) -> Self: instance = super().__new__(cls) + + # find config passed in arguments quant_config = cls._find_quant_config(*args, **kwargs) if quant_config is not None: + + # attach config to model for general use instance.quant_config = quant_config - instance.quant_config.packed_modules_mapping.update( - cls.packed_modules_mapping) + + # apply model mappings to config for proper config-model matching + # NOTE: `TransformersForCausalLM` is not supported due to how this + # class defines `hf_to_vllm_mapper` as a post-init `@property`. 
+ # After this is fixed, get `instance.hf_to_vllm_mapper` directly + if getattr(instance, "hf_to_vllm_mapper", None) is not None: + instance.quant_config.apply_vllm_mapper( + instance.hf_to_vllm_mapper) + if getattr(instance, "packed_modules_mapping", None) is not None: + instance.quant_config.packed_modules_mapping.update( + instance.packed_modules_mapping) + return instance @staticmethod def _find_quant_config(*args, **kwargs) -> Optional[QuantizationConfig]: + """Find quant config passed through model constructor args""" from vllm.config import VllmConfig # avoid circular import args_values = list(args) + list(kwargs.values()) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index ff53a2775e3d4..1b64b61a1e5cf 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -61,7 +61,7 @@ from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope from .interfaces import (MultiModalEmbeddings, SupportsLoRA, - SupportsMultiModal, SupportsPP) + SupportsMultiModal, SupportsPP, SupportsQuant) from .qwen2_vl import Qwen2VLDummyInputsBuilder as Qwen2_5_VLDummyInputsBuilder from .qwen2_vl import (Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo, apply_rotary_pos_emb_vision) @@ -821,7 +821,8 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor): info=Qwen2_5_VLProcessingInfo, dummy_inputs=Qwen2_5_VLDummyInputsBuilder) class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsLoRA, SupportsPP): + SupportsLoRA, SupportsPP, + SupportsQuant): # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper( @@ -837,7 +838,6 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config @@ -846,7 +846,7 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.visual = Qwen2_5_VisionTransformer( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=self._maybe_ignore_quant_config(quant_config), + quant_config=self._maybe_ignore_quant_config(self.quant_config), prefix=maybe_prefix(prefix, "visual"), ) @@ -859,12 +859,12 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): + def _maybe_ignore_quant_config(self, config: Optional[QuantizationConfig]): # GPTQ configs do not have a list of ignored modules, however AutoGPTQ # seems to avoid vision encoder sections for some models. 
- if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): + if isinstance(config, (GPTQConfig, GPTQMarlinConfig)): return None - return quant_config + return config def _validate_and_reshape_mm_tensor(self, mm_input: object, name: str) -> torch.Tensor: diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 2f78d9d4cc065..04ee3a454f9d8 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -467,6 +467,7 @@ class TransformersForCausalLM(nn.Module, SupportsQuant, SupportsLoRA, # FIXME(Isotr0py): Don't use any weights mapper for Transformers backend, # this makes thing complicated. We need to remove this mapper after refactor # `TransformersModel` in the future. + # NOTE: `SupportsQuant` can be updated after property decorator is removed @property def hf_to_vllm_mapper(self): prefix_mapper = { diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index aa88f42101605..62deb68035b92 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -4,7 +4,7 @@ import itertools from collections.abc import Iterable, Mapping from dataclasses import dataclass, field -from typing import Callable, Literal, Optional, Protocol, Union, overload +from typing import Any, Callable, Literal, Optional, Protocol, Union, overload import torch import torch.nn as nn @@ -64,6 +64,19 @@ class WeightsMapper: return ((out_name, data) for name, data in weights if (out_name := self._map_name(name)) is not None) + def apply_list(self, values: list[str]) -> list[str]: + return [ + out_name for name in values + if (out_name := self._map_name(name)) is not None + ] + + def apply_dict(self, values: dict[str, Any]) -> dict[str, Any]: + return { + out_name: value + for name, value in values.items() + if (out_name := self._map_name(name)) is not None + } + class AutoWeightsLoader: """ diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index cbaa34bfc30b2..2b20ca2a3ba3f 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -58,7 +58,8 @@ def _make_synced_weight_loader(original_weight_loader): def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]: - parent_map = copy.deepcopy(getattr(model, "packed_modules_mapping", {})) + parent_map = getattr(model, "packed_modules_mapping", None) + parent_map = copy.deepcopy(parent_map) if parent_map is not None else {} # don't infer mapping if the model has defined it explicitly. 
if parent_map: @@ -66,7 +67,9 @@ def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]: # We only check main components instead of whole model submodules for child in model.children(): - child_map = getattr(child, "packed_modules_mapping", {}) + child_map = getattr(child, "packed_modules_mapping", None) + child_map = copy.deepcopy(child_map) if child_map is not None else {} + if any((k in parent_map and parent_map[k] != v) for k, v in child_map.items()): raise ValueError( From 650d5dbd04e92f5043a11e4a4d86d4f39ee1b694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 1 Jul 2025 13:40:14 +0200 Subject: [PATCH 017/167] [Misc] Minor refactor of NIXL background handshake (#20068) Signed-off-by: NickLucche --- .../kv_connector/v1/nixl_connector.py | 60 ++++++++++--------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 7a077dce7706c..56ae1acf8571f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -515,6 +515,33 @@ class NixlConnectorWorker: # Remote rank -> agent name. return {p_remote_rank: handshake(path, p_remote_rank)} + def _background_nixl_handshake(self, req_id: str, + remote_engine_id: EngineId, meta: ReqMeta): + # Do NIXL handshake in background and add to _ready_requests when done. + fut = self._handshake_futures.get(remote_engine_id) + if fut is None: + fut = self._handshake_initiation_executor.submit( + self._nixl_handshake, meta.remote_host, meta.remote_port, + meta.tp_size) + self._handshake_futures[remote_engine_id] = fut + + def done_callback(f: Future[dict[int, str]], eid=remote_engine_id): + with self._handshake_lock: + del self._handshake_futures[eid] + try: + self._remote_agents[eid] = f.result() + except Exception: + logger.exception("Handshake with %s failed", eid) + + fut.add_done_callback(done_callback) + + # TODO: handle failure state of future in the + # callback, we want to fail the request in this case. + def request_ready(_f: Future[Any], entry=(req_id, meta)): + self._ready_requests.put(entry) + + fut.add_done_callback(request_ready) + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): """Register the KV Cache data in nixl.""" @@ -902,37 +929,14 @@ class NixlConnectorWorker: remote_engine_id, len(meta.local_block_ids), len(meta.remote_block_ids)) if remote_engine_id not in self._remote_agents: - # Being optimistic to assume engine is usually ready, apply - # lock only when the optimistic check fails. + # Initiate handshake with remote engine to exchange metadata. with self._handshake_lock: if remote_engine_id not in self._remote_agents: - fut = self._handshake_futures.get(remote_engine_id) - if fut is None: - fut = self._handshake_initiation_executor.submit( - self._nixl_handshake, meta.remote_host, - meta.remote_port, meta.tp_size) - self._handshake_futures[remote_engine_id] = fut - - def done_callback(f: Future[dict[int, str]], - eid=remote_engine_id): - with self._handshake_lock: - del self._handshake_futures[eid] - try: - self._remote_agents[eid] = f.result() - except Exception: - logger.exception( - "Handshake with %s failed", eid) - - fut.add_done_callback(done_callback) - - # TODO: handle failure state of future in the - # callback, we want to fail the request in this case. 
-                        def request_ready(_f: Future[Any],
-                                          entry=(req_id, meta)):
-                            self._ready_requests.put(entry)
-
-                        fut.add_done_callback(request_ready)
+                        self._background_nixl_handshake(
+                            req_id, remote_engine_id, meta)
                 continue
+
+            # Handshake already completed, start async read xfer.
             self._read_blocks_for_req(req_id, meta)

         # Start transfers for requests whose handshakes have now finished.

From ed70f3c64f684750edea087e286cbf264e7cc3f3 Mon Sep 17 00:00:00 2001
From: Yuxuan Zhang <2448370773@qq.com>
Date: Tue, 1 Jul 2025 20:48:26 +0800
Subject: [PATCH 018/167] Add GLM4.1V model (Draft) (#19331)

Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: Isotr0py
Co-authored-by: Isotr0py
---
 docs/models/supported_models.md | 3 +-
 examples/offline_inference/vision_language.py | 40 +-
 tests/entrypoints/openai/test_video.py | 2 +-
 .../multimodal/generation/test_common.py | 28 +
 .../generation/vlm_utils/custom_inputs.py | 20 +
 .../generation/vlm_utils/model_utils.py | 24 +
 .../multimodal/processing/test_common.py | 24 +
 tests/models/registry.py | 1 +
 tests/multimodal/test_utils.py | 4 +-
 vllm/assets/video.py | 26 +-
 vllm/entrypoints/chat_utils.py | 4 +
 .../model_executor/layers/rotary_embedding.py | 119 ++
 vllm/model_executor/models/glm4_1v.py | 1589 +++++++++++++++++
 vllm/model_executor/models/registry.py | 1 +
 vllm/multimodal/inputs.py | 8 +-
 vllm/multimodal/parse.py | 42 +-
 vllm/multimodal/video.py | 27 +-
 17 files changed, 1946 insertions(+), 16 deletions(-)
 create mode 100644 vllm/model_executor/models/glm4_1v.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 0248700292ae2..db650b37a38db 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -553,6 +553,7 @@ Specified using `--task generate`.
 | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b` etc. | | ✅︎ | ✅︎ |
 | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
 | `GLM4VForCausalLM`^ | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `THUDM/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
 | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎\* |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | | ✅︎ |
@@ -583,7 +584,7 @@ Specified using `--task generate`.
 | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
 | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
 | `TarsierForConditionalGeneration` | Tarsier | T + IE+ | `omni-search/Tarsier-7b`,`omni-search/Tarsier-34b` | | ✅︎ | ✅︎ |
-| `Tarsier2ForConditionalGeneration`^ | Tarsier2 | T + IE+ + VE+ | `omni-research/Tarsier2-Recap-7b`,`omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ |
+| `Tarsier2ForConditionalGeneration`^ | Tarsier2 | T + IE+ + VE+ | `omni-research/Tarsier2-Recap-7b`,`omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ |

 ^ You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
• For example, to use DeepSeek-VL2 series models: diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 57b042ed013b1..b9e8bef26eb24 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -248,6 +248,42 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: ) +# GLM-4.1V +def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData: + model_name = "THUDM/GLM-4.1V-9B-Thinking" + + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=2, + mm_processor_kwargs={ + "size": {"shortest_edge": 12544, "longest_edge": 47040000}, + "fps": 1, + }, + limit_mm_per_prompt={modality: 1}, + enforce_eager=True, + ) + + if modality == "image": + placeholder = "<|begin_of_image|><|image|><|end_of_image|>" + elif modality == "video": + placeholder = "<|begin_of_video|><|video|><|end_of_video|>" + + prompts = [ + ( + "[gMASK]<|system|>\nYou are a helpful assistant.<|user|>\n" + f"{placeholder}" + f"{question}<|assistant|>assistant\n" + ) + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # H2OVL-Mississippi def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1114,6 +1150,7 @@ model_example_map = { "fuyu": run_fuyu, "gemma3": run_gemma3, "glm4v": run_glm4v, + "glm4_1v": run_glm4_1v, "h2ovl_chat": run_h2ovl, "idefics3": run_idefics3, "internvl_chat": run_internvl, @@ -1172,10 +1209,11 @@ def get_multi_modal_input(args): if args.modality == "video": # Input video and question video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays + metadata = VideoAsset(name="baby_reading", num_frames=args.num_frames).metadata vid_questions = ["Why is this video funny?"] return { - "data": video, + "data": [(video, metadata)] if args.model_type == "glm4_1v" else video, "questions": vid_questions, } diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index 990ea3579291d..b68e08556ee96 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -50,7 +50,7 @@ async def client(server): @pytest.fixture(scope="session") def base64_encoded_video() -> dict[str, str]: return { - video_url: encode_video_base64(fetch_video(video_url)) + video_url: encode_video_base64(fetch_video(video_url)[0]) for video_url in TEST_VIDEO_URLS } diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 9d63339737ce6..6ecf6db56cb39 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -309,6 +309,34 @@ VLM_TEST_SETTINGS = { num_logprobs=10, marks=[large_gpu_mark(min_gb=32)], ), + "glm4_1v": VLMTestInfo( + models=["THUDM/GLM-4.1V-9B-Thinking"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501 + video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501 + max_model_len=2048, + max_num_seqs=2, + get_stop_token_ids=lambda tok: [151329, 151336, 151338], + num_logprobs=10, + image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + auto_cls=AutoModelForImageTextToText, + ), + "glm4_1v-video": VLMTestInfo( 
+ models=["THUDM/GLM-4.1V-9B-Thinking"], + # GLM4.1V require include video metadata for input + test_type=VLMTestType.CUSTOM_INPUTS, + max_model_len=4096, + max_num_seqs=2, + auto_cls=AutoModelForImageTextToText, + patch_hf_runner=model_utils.glm4_1v_patch_hf_runner, + custom_test_opts=[CustomTestOptions( + inputs=custom_inputs.video_with_metadata_glm4_1v(), + limit_mm_per_prompt={"video": 1}, + )], + # This is needed to run on machine with 24GB VRAM + vllm_runner_kwargs={"gpu_memory_utilization": 0.95}, + ), "h2ovl": VLMTestInfo( models = [ "h2oai/h2ovl-mississippi-800m", diff --git a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py index aa5835243e042..c53243b42e384 100644 --- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py +++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py @@ -129,3 +129,23 @@ def windows_attention_image_qwen2_5_vl(): wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5]) return build_single_image_inputs([image], [prompt], wrapped_sf) + + +def video_with_metadata_glm4_1v(): + video_array = VIDEO_ASSETS[0].np_ndarrays + metadata = VIDEO_ASSETS[0].metadata + question = "Describe the video." + video_prompt = "<|begin_of_video|><|video|><|end_of_video|>" + formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n" + + scales = [0.1, 0.2, 0.25] + video_input = [[(rescale_video_size(video_array, scale), metadata)] + for scale in scales] + prompts = [formatted_prompt] * len(video_input) + + return [ + PromptWithMultiModalInput( + prompts=prompts, + video_data=video_input, + ) + ] diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index af4c72f44b676..c1a2aa0dcafbb 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -16,9 +16,11 @@ import torch from PIL.Image import Image from transformers import (AutoConfig, AutoTokenizer, BatchFeature, GenerationConfig, GenerationMixin) +from transformers.video_utils import VideoMetadata from vllm.sequence import SampleLogprobs from vllm.transformers_utils.tokenizer import patch_padding_side +from vllm.utils import is_list_of from .....conftest import HfRunner, ImageAsset, ImageTestAssets from .types import RunnerOutput @@ -373,6 +375,28 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner: return hf_model +def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + """Patches and returns an instance of the HfRunner to use for GLM4.1V.""" + hf_processor = hf_model.processor + + def processor(*args, videos=None, **kwargs): + if videos is not None and is_list_of(videos, tuple): + # If videos is a list of tuples, we assume each tuple contains + # (video_array, metadata) as in the case of GLM4.1V. 
+ video_metadata = [[VideoMetadata(**video[1])] for video in videos] + videos = [[video[0]] for video in videos] + else: + video_metadata = None + + return hf_processor(*args, + videos=videos, + video_metadata=video_metadata, + **kwargs) + + hf_model.processor = processor + return hf_model + + def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: """Patches and returns an instance of the HfRunner to use for H2OVL.""" diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 1ba60178c13db..0f33225eda2da 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -24,6 +24,22 @@ from ....multimodal.utils import random_audio, random_image, random_video from ...registry import HF_EXAMPLE_MODELS +def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict: + """ + Patch the multimodal data for GLM4.1V model. + """ + # Ensure video metadata is included + if "video" in mm_data: + video = mm_data["video"] + mm_data["video"] = (video, { + "total_num_frames": len(video), + "fps": len(video), + "duration": 1, + "video_backend": "opencv" + }) + return mm_data + + def _test_processing_correctness( model_id: str, hit_rate: float, @@ -154,6 +170,11 @@ _IGNORE_MM_KEYS = { "ultravox": {"audio_features"}, } +MM_DATA_PATCHES = { + # GLM4.1V requires video metadata to be included in the input + "glm4v": glm4_1v_patch_mm_data, +} + def _test_processing_correctness_one( model_config: ModelConfig, @@ -166,6 +187,8 @@ def _test_processing_correctness_one( ): model_type = model_config.hf_config.model_type ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]()) + if model_type in MM_DATA_PATCHES: + mm_data = MM_DATA_PATCHES[model_type](mm_data) if isinstance(prompt, str): text_prompt = prompt @@ -245,6 +268,7 @@ def _test_processing_correctness_one( "adept/fuyu-8b", "google/gemma-3-4b-it", "THUDM/glm-4v-9b", + "THUDM/GLM-4.1V-9B-Thinking", "ibm-granite/granite-speech-3.3-2b", "h2oai/h2ovl-mississippi-800m", "OpenGVLab/InternVL2-1B", diff --git a/tests/models/registry.py b/tests/models/registry.py index e56dd19bec670..affe2e88b2b94 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -338,6 +338,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { "GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b", trust_remote_code=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 + "Glm4vForConditionalGeneration": _HfExamplesInfo("THUDM/GLM-4.1V-9B-Thinking", min_transformers_version="4.53"), # noqa: E501 "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m", extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501 max_transformers_version="4.48", # noqa: E501 diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 5ac0a90f50473..a48542cec3f87 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -172,7 +172,9 @@ async def test_fetch_video_http(video_url: str, num_frames: int): video_sync = connector.fetch_video(video_url, num_frames=num_frames) video_async = await connector.fetch_video_async(video_url, num_frames=num_frames) - assert np.array_equal(video_sync, video_async) + # Check that the video frames are equal and metadata are same + assert np.array_equal(video_sync[0], video_async[0]) + assert video_sync[1] == video_async[1] # Used for the next two tests related to `merge_and_sort_multimodal_metadata`. 
diff --git a/vllm/assets/video.py b/vllm/assets/video.py index 01834aeeb6c12..16412121cf0a8 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from functools import lru_cache -from typing import ClassVar, Literal, Optional +from typing import Any, ClassVar, Literal, Optional import cv2 import numpy as np @@ -77,6 +77,24 @@ def video_to_pil_images_list(path: str, ] +def video_get_metadata(path: str) -> dict[str, Any]: + cap = cv2.VideoCapture(path) + if not cap.isOpened(): + raise ValueError(f"Could not open video file {path}") + + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + fps = cap.get(cv2.CAP_PROP_FPS) + duration = total_frames / fps if fps > 0 else 0 + + metadata = { + "total_num_frames": total_frames, + "fps": fps, + "duration": duration, + "video_backend": "opencv" + } + return metadata + + VideoAssetName = Literal["baby_reading"] @@ -105,6 +123,12 @@ class VideoAsset: ret = video_to_ndarrays(video_path, self.num_frames) return ret + @property + def metadata(self) -> dict[str, Any]: + video_path = download_video_asset(self.filename) + ret = video_get_metadata(video_path) + return ret + def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray: """ Read audio data from the video asset, used in Qwen2.5-Omni examples. diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 35ee52ab4601d..45f1894d022b3 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -515,6 +515,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): if modality in ("image", "image_embeds"): if model_type == "chatglm": return "<|begin_of_image|><|endoftext|><|end_of_image|>" + if model_type == "glm4v": + return "<|begin_of_image|><|image|><|end_of_image|>" if model_type in ("phi3_v", "phi4mm"): return f"<|image_{current_count}|>" if model_type in ("minicpmo", "minicpmv"): @@ -563,6 +565,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): elif modality == "video": if model_type == "internvl_chat": return "