Merge branch 'main' into qiang_ci_samplers_test

2026-06-28 10:57:15 +08:00 · 2025-12-23 23:12:35 -06:00 · 2025-12-23 23:12:35 -06:00 · b62e11ea1e
commit b62e11ea1e
parent c810903fc9 3ce791ac77
1555 changed files with 101697 additions and 38153 deletions
--- a/.buildkite/ci_config.yaml
+++ b/.buildkite/ci_config.yaml
@ -0,0 +1,24 @@
+name: vllm_ci
+job_dirs:  # NOTE(review): presumably the directories scanned for step definitions — confirm against the pipeline generator
+  - ".buildkite/test_areas"
+  - ".buildkite/image_build"
+run_all_patterns:  # NOTE(review): presumably a change under any of these paths triggers every job — confirm
+  - "docker/Dockerfile"
+  - "CMakeLists.txt"
+  - "requirements/common.txt"
+  - "requirements/cuda.txt"
+  - "requirements/build.txt"
+  - "requirements/test.txt"
+  - "setup.py"
+  - "csrc/"
+  - "cmake/"
+run_all_exclude_patterns:  # NOTE(review): looks like carve-outs from run_all_patterns (every entry sits under a path listed above); matching rules not visible here — confirm
+  - "docker/Dockerfile."
+  - "csrc/cpu/"
+  - "csrc/rocm/"
+  - "cmake/hipify.py"
+  - "cmake/cpu_extension.cmake"
+registries: public.ecr.aws/q9t5s3a7
+repositories:
+  main: "vllm-ci-postmerge-repo"
+  premerge: "vllm-ci-test-repo"
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@ -1,46 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import os
-
-template = """<!DOCTYPE html>
-<html>
-    <body>
-    <h1>Links for vLLM</h1/>
-        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
-        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
-    </body>
-</html>
-"""
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--wheel", help="The wheel path.", required=True)
-args = parser.parse_args()
-
-filename = os.path.basename(args.wheel)
-
-with open("index.html", "w") as f:
-    print(f"Generated index.html for {args.wheel}")
-    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
-    if "x86_64" in filename:
-        x86_wheel = filename
-        arm_wheel = filename.replace("x86_64", "aarch64").replace(
-            "manylinux1", "manylinux2014"
-        )
-    elif "aarch64" in filename:
-        x86_wheel = filename.replace("aarch64", "x86_64").replace(
-            "manylinux2014", "manylinux1"
-        )
-        arm_wheel = filename
-    else:
-        raise ValueError(f"Unsupported wheel: {filename}")
-    # cloudfront requires escaping the '+' character
-    f.write(
-        template.format(
-            x86_wheel=x86_wheel,
-            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
-            arm_wheel=arm_wheel,
-            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
-        )
-    )
--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
@ -0,0 +1,77 @
+#!/bin/bash
+# Build and push the vLLM CI test image with docker buildx.
+#
+# Arguments (all required, positional):
+#   1 registry                 registry to log in to and push to
+#   2 repo                     repository name inside the registry
+#   3 commit                   commit SHA, used as the image tag
+#   4 branch                   branch name; "main" additionally tags :latest
+#   5 vllm_use_precompiled     forwarded as the VLLM_USE_PRECOMPILED build arg
+#   6 vllm_merge_base_commit   forwarded as a build arg when precompiled is "1"
+#   7 cache_from               registry ref used for --cache-from
+#   8 cache_to                 registry ref used for --cache-to
+set -e
+
+if [[ $# -lt 8 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+BRANCH=$4
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+CACHE_FROM=$7
+CACHE_TO=$8
+
+IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}"
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+
+# docker buildx: reuse the builder if it already exists (e.g. when a retried
+# job lands on the same agent); `buildx create` fails on a duplicate name and
+# would abort the script under `set -e`.
+docker buildx use vllm-builder 2>/dev/null \
+  || docker buildx create --name vllm-builder --driver docker-container --use
+docker buildx inspect --bootstrap
+docker buildx ls
+
+# skip build if image already exists
+if docker manifest inspect "$IMAGE" >/dev/null; then
+  echo "Image found"
+  exit 0
+fi
+echo "Image not found, proceeding with build..."
+
+# Optional arguments are collected in arrays so that empty ones expand to
+# nothing and values are never re-split by the shell.
+merge_base_commit_build_args=()
+if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
+  merge_base_commit_build_args=(--build-arg "VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}")
+fi
+
+latest_tag_args=()
+if [[ "${BRANCH}" == "main" ]]; then
+  latest_tag_args=(--tag "${REGISTRY}/${REPO}:latest")
+fi
+
+# build
+docker buildx build --file docker/Dockerfile \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
+  --build-arg USE_SCCACHE=1 \
+  --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
+  --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
+  --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
+  "${merge_base_commit_build_args[@]}" \
+  --cache-from "type=registry,ref=${CACHE_FROM},mode=max" \
+  --cache-to "type=registry,ref=${CACHE_TO},mode=max" \
+  --tag "$IMAGE" \
+  "${latest_tag_args[@]}" \
+  --push \
+  --target test \
+  --progress plain .
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@ -0,0 +1,57 @@
+group: Abuild
+steps:
+  - label: ":docker: Build image"
+    key: image-build
+    depends_on: []
+    commands:  # NOTE(review): the arguments below are unquoted — an empty variable shifts the remaining positions and image_build.sh then fails its 8-argument usage check
+    - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build CPU image"
+    key: image-build-cpu
+    depends_on: []
+    commands:
+    - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build HPU image"
+    soft_fail: true  # a failure of this step does not fail the build
+    depends_on: []
+    key: image-build-hpu
+    commands:
+    - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
+  
+  - label: ":docker: Build CPU arm64 image"
+    key: cpu-arm64-image-build
+    depends_on: []
+    optional: true  # NOTE(review): presumably interpreted by the pipeline generator — confirm what "optional" gates
+    commands:
+    - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
--- a/.buildkite/image_build/image_build_cpu.sh
+++ b/.buildkite/image_build/image_build_cpu.sh
@ -0,0 +1,43 @
+#!/bin/bash
+# Build and push the vLLM CPU CI test image (tag: <commit>-cpu).
+#
+# Arguments (all required, positional):
+#   1 registry   registry to log in to and push to
+#   2 repo       repository name inside the registry
+#   3 commit     commit SHA; the image is tagged <commit>-cpu
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-cpu"
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+
+# skip build if image already exists
+if docker manifest inspect "$IMAGE" >/dev/null; then
+  echo "Image found"
+  exit 0
+fi
+echo "Image not found, proceeding with build..."
+
+# build
+docker build --file docker/Dockerfile.cpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
+  --build-arg VLLM_CPU_AVX512BF16=true \
+  --build-arg VLLM_CPU_AVX512VNNI=true \
+  --build-arg VLLM_CPU_AMXBF16=true \
+  --tag "$IMAGE" \
+  --target vllm-test \
+  --progress plain .
+
+# push
+docker push "$IMAGE"
--- a/.buildkite/image_build/image_build_cpu_arm64.sh
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@ -0,0 +1,47 @
+#!/bin/bash
+# Build and push the vLLM CPU CI test image from docker/Dockerfile.cpu.
+# Unlike image_build_cpu.sh, no VLLM_CPU_* feature build args are passed.
+# The script name targets arm64, but nothing here selects a platform, so the
+# image is built for docker's default platform on the agent that runs it.
+#
+# NOTE(review): the tag is <commit>-cpu, the same tag image_build_cpu.sh uses.
+# If both scripts push to the same registry/repo, whichever runs second finds
+# the other's image and skips its own build — confirm this is intended.
+#
+# Arguments (all required, positional):
+#   1 registry   registry to log in to and push to
+#   2 repo       repository name inside the registry
+#   3 commit     commit SHA; the image is tagged <commit>-cpu
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-cpu"
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+
+# skip build if image already exists
+if docker manifest inspect "$IMAGE" >/dev/null; then
+  echo "Image found"
+  exit 0
+fi
+echo "Image not found, proceeding with build..."
+
+# build
+docker build --file docker/Dockerfile.cpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
+  --tag "$IMAGE" \
+  --target vllm-test \
+  --progress plain .
+
+# push
+docker push "$IMAGE"
--- a/.buildkite/image_build/image_build_hpu.sh
+++ b/.buildkite/image_build/image_build_hpu.sh
@ -0,0 +1,42 @
+#!/bin/bash
+# Build and push the vLLM HPU CI test image (tag: <commit>-hpu).
+# The build context is the vllm-gaudi GitHub repository, not this checkout.
+#
+# Arguments (all required, positional):
+#   1 registry   registry to log in to and push to
+#   2 repo       repository name inside the registry
+#   3 commit     commit SHA; the image is tagged <commit>-hpu
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-hpu"
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+
+# skip build if image already exists
+if docker manifest inspect "$IMAGE" >/dev/null; then
+  echo "Image found"
+  exit 0
+fi
+echo "Image not found, proceeding with build..."
+
+# build
+docker build \
+  --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
+  --tag "$IMAGE" \
+  --progress plain \
+  https://github.com/vllm-project/vllm-gaudi.git
+
+# push
+docker push "$IMAGE"
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@ -8,3 +8,4 @@ tasks:
    value: 0.80
 limit: 250 # will run on 250 * 14 subjects = 3500 samples
 num_fewshot: 5
+rtol: 0.05
--- a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
    --tp-size=1
 """

+import os
+from contextlib import contextmanager
+
 import lm_eval
 import numpy as np
 import yaml

-RTOL = 0.08
+DEFAULT_RTOL = 0.08
+
+
+@contextmanager
+def scoped_env_vars(new_env: dict[str, str]):
+    """Temporarily apply ``new_env`` to ``os.environ`` for the ``with`` body.
+
+    Values are stringified on assignment. On exit, overwritten variables get
+    their previous values back and newly added ones are removed. A falsy
+    ``new_env`` (``None`` or empty) leaves the environment untouched.
+    """
+    if not new_env:
+        yield
+        return
+
+    saved = {}  # name -> previous value, or None if the variable was unset
+    try:
+        for name, raw in new_env.items():
+            saved[name] = os.environ.get(name)
+            os.environ[name] = str(raw)
+        yield
+    finally:
+        for name, previous in saved.items():
+            if previous is None:
+                os.environ.pop(name, None)
+            else:
+                os.environ[name] = previous


 def launch_lm_eval(eval_config, tp_size):
@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
        f"trust_remote_code={trust_remote_code},"
        f"max_model_len={max_model_len},"
    )
-    results = lm_eval.simple_evaluate(
-        model=backend,
-        model_args=model_args,
-        tasks=[task["name"] for task in eval_config["tasks"]],
-        num_fewshot=eval_config["num_fewshot"],
-        limit=eval_config["limit"],
-        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
-        # text models. however, this is regressing measured strict-match for
-        # existing text models in CI, so only apply it for mm, or explicitly set
-        apply_chat_template=eval_config.get(
-            "apply_chat_template", backend == "vllm-vlm"
-        ),
-        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
-        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
-        gen_kwargs=eval_config.get("gen_kwargs"),
-        batch_size=batch_size,
-    )
+
+    env_vars = eval_config.get("env_vars", None)
+    with scoped_env_vars(env_vars):
+        results = lm_eval.simple_evaluate(
+            model=backend,
+            model_args=model_args,
+            tasks=[task["name"] for task in eval_config["tasks"]],
+            num_fewshot=eval_config["num_fewshot"],
+            limit=eval_config["limit"],
+            # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+            # text models. however, this is regressing measured strict-match for
+            # existing text models in CI, so only apply it for mm, or explicitly set
+            apply_chat_template=eval_config.get(
+                "apply_chat_template", backend == "vllm-vlm"
+            ),
+            fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+            # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+            gen_kwargs=eval_config.get("gen_kwargs"),
+            batch_size=batch_size,
+        )
    return results


@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):

    results = launch_lm_eval(eval_config, tp_size)

+    rtol = eval_config.get("rtol", DEFAULT_RTOL)
+
    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
            measured_value = results["results"][task["name"]][metric["name"]]
            print(
                f"{task['name']} | {metric['name']}: "
-                f"ground_truth={ground_truth} | measured={measured_value}"
+                f"ground_truth={ground_truth:.3f} | "
+                f"measured={measured_value:.3f} | rtol={rtol}"
            )
-            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
+            success = success and np.isclose(ground_truth, measured_value, rtol=rtol)

    assert success
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http

 ## Performance benchmark quick overview

-**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models.

 **Benchmarking Duration**: about 1hr.

@ -23,7 +23,7 @@ bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh

 Runtime environment variables:

- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
+- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
 - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
 - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
 - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
@ -34,8 +34,9 @@ Runtime environment variables:

 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
-For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
->
+> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
+> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
+
 ### Latency test

 Here is an example of one test inside `latency-tests.json`:
@ -108,6 +109,65 @@ The number of this test is less stable compared to the delay and latency benchma

 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.

+#### Default Parameters Field
+
+Default parameters can be specified under a top-level `defaults` key, with the test cases listed under `tests`. Parameters defined in `defaults` are applied to every serving test and can be overridden in the individual test case fields. Here is an example:
+
+<details>
+<summary> An example of the default parameters field </summary>
+
+```json
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "server_environment_variables": {
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+    },
+    "server_parameters": {
+      "tensor_parallel_size": 1,
+      "dtype": "bfloat16",
+      "block_size": 128,
+      "disable_log_stats": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "backend": "vllm",
+      "dataset_name": "random",
+      "random-input-len": 128,
+      "random-output-len": 128,
+      "num_prompts": 200,
+      "ignore-eos": ""
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_llama3B_tp2_random_128_128",
+      "server_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct"
+      }
+    },
+    {
+      "test_name": "serving_qwen3_tp4_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-14B",
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-14B"
+      }
+    }
+  ]
+}
+```
+
+</details>
+
 ### Visualizing the results

 The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@ -49,7 +49,11 @@ check_cpus() {
    echo "Need at least 1 NUMA to run benchmarking."
    exit 1
  fi
-  declare -g gpu_type="cpu"
+  if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then
+    declare -g gpu_type="arm64-cpu"
+  else
+    declare -g gpu_type="cpu"
+  fi
  echo "GPU type is $gpu_type"
 }

@ -110,7 +114,8 @@ json2envs() {
 wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
-  timeout 1200 bash -c '
+  local timeout_val="1200"
+  timeout "$timeout_val" bash -c '
    until curl -X POST localhost:8000/v1/completions; do
      sleep 1
    done' && return 0 || return 1
@ -206,8 +211,8 @@ run_latency_tests() {

    # check if there is enough GPU to run the test
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
+    if [[ "$ON_CPU" == "1" ]]; then
+      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@ -275,8 +280,8 @@ run_throughput_tests() {

    # check if there is enough GPU to run the test
    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
+    if [[ "$ON_CPU" == "1" ]]; then
+      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@ -316,12 +321,44 @@ run_throughput_tests() {
 run_serving_tests() {
  # run serving tests using `vllm bench serve` command
  # $1: a json file specifying serving test cases
+  #
+  # Supported JSON formats:
+  # 1) Plain format: top-level array
+  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #
+  # 2) Default parameters field + plain format tests
+  #    {
+  #      "defaults": { ... },
+  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #    }

  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
+  jq -c '
+    if type == "array" then
+      # Plain format: test cases array
+      .[]
+    elif (type == "object" and has("tests")) then
+      # merge the default parameters into each test case
+      . as $root
+      | ($root.defaults // {}) as $d
+      | ($root.tests // [])[]
+      # default qps / max_concurrency from defaults if missing
+      | .qps_list = (.qps_list // $d.qps_list)
+      | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
+      # merge envs / params: test overrides defaults
+      | .server_environment_variables =
+          (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
+      | .server_parameters =
+          (($d.server_parameters // {}) + (.server_parameters // {}))
+      | .client_parameters =
+          (($d.client_parameters // {}) + (.client_parameters // {}))
+    else
+      error("Unsupported serving test file format: must be array or object with .tests")
+    end
+  ' "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^serving_ ]]; then
@ -335,28 +372,33 @@ run_serving_tests() {
      continue
    fi

-    # get client and server arguments
+    # get client and server arguments (after merging in the default parameters)
    server_params=$(echo "$params" | jq -r '.server_parameters')
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')
+
    server_args=$(json2args "$server_params")
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")
+
+    # qps_list
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"
+
+    # max_concurrency_list (fallback to num_prompts if missing)
    max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
    if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
-        num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
-        max_concurrency_list="[$num_prompts]"
+      num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
+      max_concurrency_list="[$num_prompts]"
    fi
    max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
    echo "Running over max concurrency list $max_concurrency_list"

    # check if there is enough resources to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
+    if [[ "$ON_CPU" == "1" ]]; then
+      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@ -458,9 +500,9 @@ run_serving_tests() {
 main() {
  local ARCH
  ARCH=''
-  if [ "$ON_CPU" == "1" ];then
-     check_cpus
-     ARCH='-cpu'
+  if [[ "$ON_CPU" == "1" ]]; then
+    check_cpus
+    ARCH="-$gpu_type"
  else
     check_gpus
     ARCH="$arch_suffix"
--- a/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json
@ -0,0 +1,26 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
--- a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
@ -0,0 +1,130 @@
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "max_concurrency_list": [
+      12,
+      16,
+      24,
+      32,
+      64,
+      128,
+      200
+    ],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 128,
+      "trust_remote_code": "",
+      "disable_log_stats": "",
+      "enforce_eager": "",
+      "max_num_batched_tokens": 2048,
+      "max_num_seqs": 256,
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "ignore-eos": "",
+      "num_prompts": 200
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_llama8B_tp1_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    }
+  ]
+}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
@ -1,610 +0,0 @@
-[
-    {
-        "test_name": "serving_llama8B_bf16_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    }
-]
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@ -1,276 +1,246 @@
-[
-    {
-        "test_name": "serving_llama8B_tp1_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 32
-        }
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
    },
-    {
-        "test_name": "serving_llama8B_tp2_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 32
-        }
+    "server_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 128,
+      "trust_remote_code": "",
+      "disable_log_stats": "",
+      "enforce_eager": "",
+      "max_num_batched_tokens": 2048,
+      "max_num_seqs": 256,
+      "load_format": "dummy"
    },
-    {
-        "test_name": "serving_llama8B_tp1_random_128_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 32
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp2_random_128_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 32
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp1_random_128_2048",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 2048,
-	    "ignore-eos": "",
-            "num_prompts": 32
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp2_random_128_2048",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 2048,
-	    "ignore-eos": "",
-            "num_prompts": 32
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp1_random_2048_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 2048,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 32
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp2_random_2048_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 2048,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 32
-        }
+    "client_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "ignore-eos": "",
+      "num_prompts": 200
    }
-]
+  },
+  "tests": [
+    {
+      "test_name": "serving_llama8B_tp1_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama3B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_granite2B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "ibm-granite/granite-3.2-2b-instruct",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "ibm-granite/granite-3.2-2b-instruct",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen1.7B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-1.7B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-1.7B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen4B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-4B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-4B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen8B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-8B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-8B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_glm9B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "zai-org/glm-4-9b-hf",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "zai-org/glm-4-9b-hf",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_gemma7B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "google/gemma-7b",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "google/gemma-7b",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    }
+  ]
+}
--- a/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json
@ -0,0 +1,27 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    }
+]
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -8,13 +8,28 @@ steps:
    commands:
      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

+  - label: "Build arm64 wheel - CUDA 13.0"
+    depends_on: ~
+    id: build-wheel-arm64-cuda-13-0
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
+      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04  --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+    env:
+      DOCKER_BUILDKIT: "1"
+
  # aarch64 build
  - label: "Build arm64 CPU wheel"
    depends_on: ~
@ -25,24 +40,11 @@ steps:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
    env:
      DOCKER_BUILDKIT: "1"

  # x86 + CUDA builds
-  - label: "Build wheel - CUDA 12.8"
-    depends_on: ~
-    id: build-wheel-cuda-12-8
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
  - label: "Build wheel - CUDA 12.9"
    depends_on: ~
    id: build-wheel-cuda-12-9
@ -52,7 +54,7 @@ steps:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
    env:
      DOCKER_BUILDKIT: "1"

@ -65,7 +67,21 @@ steps:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # x86 CPU wheel build
+  - label: "Build x86 CPU wheel"
+    depends_on: ~
+    id: build-wheel-x86-cpu
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
    env:
      DOCKER_BUILDKIT: "1"

@ -109,7 +125,6 @@ steps:
  - label: "Annotate release workflow"
    depends_on:
      - create-multi-arch-manifest
-      - build-wheel-cuda-12-8
    id: annotate-release-workflow
    agents:
      queue: cpu_queue_postmerge
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@ -23,8 +23,8 @@ To download the wheel (by version):
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .

-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`

 To download and upload the image:
@ -45,9 +45,10 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest rm vllm/vllm-openai:latest
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker manifest push vllm/vllm-openai:latest
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
-EOF 
+EOF 
--- a/.buildkite/scripts/generate-nightly-index.py
+++ b/.buildkite/scripts/generate-nightly-index.py
@ -0,0 +1,408 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# do not complain about line length (for docstring)
+# ruff: noqa: E501
+
+import argparse
+import json
+import sys
+from dataclasses import asdict, dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+from urllib.parse import quote
+
+import regex as re
+
+# Fail fast on older interpreters: e.g. `Path.relative_to(..., walk_up=True)`,
+# which this script relies on, is only available from Python 3.12.
+if not sys.version_info >= (3, 12):
+    raise RuntimeError("This script requires Python 3.12 or higher.")
+
+INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
+<html>
+  <!-- {comment} -->
+  <meta name="pypi:repository-version" content="1.0">
+  <body>
+{items}
+  </body>
+</html>
+"""
+
+
+@dataclass
+class WheelFileInfo:
+    """Components of a single wheel file name, as produced by `parse_from_filename`."""
+
+    package_name: str
+    version: str  # version string with the variant suffix (if any) removed
+    build_tag: str | None  # optional build tag; None when the file name has none
+    python_tag: str
+    abi_tag: str
+    platform_tag: str
+    variant: str | None  # e.g. "cu129"; None when the version carries no variant
+    filename: str  # the original, unmodified wheel file name
+
+def parse_from_filename(file: str) -> WheelFileInfo:
+    """
+    Split a wheel file name into its components.
+
+    Wheel names follow the pattern:
+        {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
+    The version may carry a variant, which is split off and returned separately:
+        - for dev versions, the last dot-separated piece after "dev" (e.g. '.cu130')
+        - otherwise, whatever follows the '+' (e.g. '+cu129')
+    Example:
+        vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
+        vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
+        vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
+        vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
+
+    Raises:
+        ValueError: if `file` does not match the wheel name pattern.
+    """
+    pattern = re.compile(
+        r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
+    )
+    parsed = pattern.match(file)
+    if parsed is None:
+        raise ValueError(f"Invalid wheel file name: {file}")
+
+    fields = parsed.groupdict()
+    version = fields["version"]
+
+    # split the variant (if any) off the version
+    variant = None
+    if "dev" in version:
+        # only look at what follows the last "dev" marker
+        tail = version.rsplit("dev", 1)[-1]
+        if "." in tail:
+            variant = tail.rsplit(".", 1)[-1]
+            version = version.removesuffix("." + variant)
+    elif "+" in version:
+        version, variant = version.split("+")
+
+    return WheelFileInfo(
+        package_name=fields["package_name"],
+        version=version,
+        build_tag=fields["build_tag"],
+        python_tag=fields["python_tag"],
+        abi_tag=fields["abi_tag"],
+        platform_tag=fields["platform_tag"],
+        variant=variant,
+        filename=file,
+    )
+
+
+def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
+    """
+    Render the project list page: one link per project / variant sub-directory.
+
+    Names are sorted first, then stripped of surrounding '/' and '.' before being linked.
+    """
+    cleaned = (raw.strip("/").strip(".") for raw in sorted(subdir_names))
+    links = "\n".join(f'    <a href="{entry}/">{entry}/</a><br/>' for entry in cleaned)
+    return INDEX_HTML_TEMPLATE.format(items=links, comment=comment)
+
+
+def generate_package_index_and_metadata(
+    wheel_files: list[WheelFileInfo],
+    wheel_base_dir: Path,
+    index_base_dir: Path,
+    comment: str = "",
+) -> tuple[str, str]:
+    """
+    Build the index page and the metadata dump for the wheels of one package.
+
+    Links are written relative to `index_base_dir` and point at the wheel files
+    under `wheel_base_dir`. Returns a `(index_html, metadata_json)` pair, where each
+    metadata record is the wheel's parsed fields plus the URL-quoted "path".
+    """
+    links: list[str] = []
+    records: list[dict[str, Any]] = []
+    for info in sorted(wheel_files, key=lambda w: w.filename):
+        wheel_path = (
+            wheel_base_dir.relative_to(index_base_dir, walk_up=True) / info.filename
+        )
+        # percent-encode '+' in the URL, while leaving '/' and any already-encoded '%2B' alone
+        # NOTE: this is AWS S3 specific behavior!
+        encoded = quote(wheel_path.as_posix(), safe=":%/")
+        links.append(f'    <a href="{encoded}">{info.filename}</a><br/>')
+        records.append(asdict(info) | {"path": encoded})
+    index_html = INDEX_HTML_TEMPLATE.format(items="\n".join(links), comment=comment)
+    return index_html, json.dumps(records, indent=2)
+
+
+def generate_index_and_metadata(
+    whl_files: list[str],
+    wheel_base_dir: Path,
+    index_base_dir: Path,
+    default_variant: str | None = None,
+    alias_to_default: str | None = None,
+    comment: str = "",
+) -> None:
+    """
+    Generate index for all wheel files.
+
+    Args:
+        whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
+        wheel_base_dir (Path): Base directory for wheel files.
+        index_base_dir (Path): Base directory to store index files.
+        default_variant (str | None): The default variant name, if any.
+        alias_to_default (str | None): Alias variant name for the default variant, if any.
+        comment (str): Optional comment to include in the generated HTML files.
+
+    Raises:
+        ValueError: if a wheel file name cannot be parsed; if `default_variant` is given
+            but some wheel has no variant suffix or no wheel has that variant; or if
+            `alias_to_default` collides with a variant that already exists.
+
+    First, parse all wheel files to extract metadata.
+    We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
+    The index for the default variant (if any) is generated in the root index directory.
+
+    If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
+    is purely a copy of the corresponding variant index, with only the links adjusted.
+    Otherwise, all wheels without variant suffixes are treated as the default variant.
+    NOTE(review): at present `default_variant` is only validated by this function; no copy of
+    that variant's index is written to the root index directory here.
+
+    If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
+    as the default variant index, but the links are adjusted accordingly.
+
+    Index directory structure:
+        index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
+            index.html  # project list, linking to "vllm/" and other packages, and all variant sub-directories
+            vllm/
+                index.html # package index, pointing to actual files in wheel_base_dir (relative path)
+                metadata.json # machine-readable metadata for all wheels in this package
+            cpu/ # cpu variant sub-directory
+                index.html
+                vllm/
+                    index.html
+                    metadata.json
+            cu129/ # cu129 is actually the alias to default variant
+                index.html
+                vllm/
+                    index.html
+                    metadata.json
+            cu130/ # cu130 variant sub-directory
+                index.html
+                vllm/
+                    index.html
+                    metadata.json
+            ...
+
+    metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
+        [
+            {
+                "package_name": "vllm",
+                "version": "0.10.2rc2",
+                "build_tag": null,
+                "python_tag": "cp38",
+                "abi_tag": "abi3",
+                "platform_tag": "manylinux2014_aarch64",
+                "variant": "cu129",
+                "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
+                "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded
+            },
+            ...
+        ]
+    """
+
+    parsed_files = [parse_from_filename(f) for f in whl_files]
+
+    if not parsed_files:
+        print("No wheel files found, skipping index generation.")
+        return
+
+    # Group by variant; wheels without a variant go under the "default" key
+    variant_to_files: dict[str, list[WheelFileInfo]] = {}
+    for file in parsed_files:
+        variant_to_files.setdefault(file.variant or "default", []).append(file)
+
+    print(f"Found variants: {list(variant_to_files.keys())}")
+
+    # sanity check for default variant
+    if default_variant:
+        if "default" in variant_to_files:
+            raise ValueError(
+                "All wheel files must have variant suffixes when `default_variant` is specified."
+            )
+        if default_variant not in variant_to_files:
+            raise ValueError(
+                f"Default variant '{default_variant}' not found among wheel files."
+            )
+
+    if alias_to_default:
+        if "default" not in variant_to_files:
+            # e.g. only some wheels are uploaded to S3 currently
+            print(
+                "[WARN] Alias to default variant specified, but no default variant found."
+            )
+        elif alias_to_default in variant_to_files:
+            raise ValueError(
+                f"Alias variant name '{alias_to_default}' already exists among wheel files."
+            )
+        else:
+            variant_to_files[alias_to_default] = variant_to_files["default"].copy()
+            print(f"Alias variant '{alias_to_default}' created for default variant.")
+
+    # Generate comment in HTML header
+    comment_str = f" ({comment})" if comment else ""
+    comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"
+
+    # Generate index for each variant
+    subdir_names: set[str] = set()
+    for variant, files in variant_to_files.items():
+        if variant == "default":
+            variant_dir = index_base_dir
+        else:
+            variant_dir = index_base_dir / variant
+            subdir_names.add(variant)
+
+        variant_dir.mkdir(parents=True, exist_ok=True)
+
+        # gather all package names in this variant
+        packages = {wheel.package_name for wheel in files}
+        if variant == "default":
+            # these packages should also appear in the "project list"
+            # generate after all variants are processed
+            subdir_names |= packages
+        else:
+            # generate project list for this variant directly
+            project_list_str = generate_project_list(sorted(packages), comment_tmpl)
+            with open(variant_dir / "index.html", "w") as f:
+                f.write(project_list_str)
+
+        for package in sorted(packages):
+            # filter files belonging to this package only
+            package_files = [wheel for wheel in files if wheel.package_name == package]
+            package_dir = variant_dir / package
+            package_dir.mkdir(parents=True, exist_ok=True)
+            # pass the timestamped header (not the raw `comment`) so that package
+            # index pages carry the same "Generated on ..." header as the project lists
+            index_str, metadata_str = generate_package_index_and_metadata(
+                package_files, wheel_base_dir, package_dir, comment_tmpl
+            )
+            with open(package_dir / "index.html", "w") as f:
+                f.write(index_str)
+            with open(package_dir / "metadata.json", "w") as f:
+                f.write(metadata_str)
+
+    # Generate top-level project list index
+    project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
+    with open(index_base_dir / "index.html", "w") as f:
+        f.write(project_list_str)
+
+
+if __name__ == "__main__":
+    """
+    Arguments:
+        --version <version> : version string for the current build (e.g., commit hash)
+        --wheel-dir <wheel_directory> : directory containing wheel files (default to be same as `version`)
+        --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
+        --output-dir <output_directory> : directory to store generated index files
+        --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
+        --comment <comment_string> : (optional) comment string to include in generated HTML files
+    """
+
+    cli = argparse.ArgumentParser(
+        description="Process nightly build wheel files to generate indices."
+    )
+    # (flag, extra keyword arguments) in registration order; every option is a plain string
+    option_specs: list[tuple[str, dict[str, Any]]] = [
+        (
+            "--version",
+            {
+                "required": True,
+                "help": "Version string for the current build (e.g., commit hash)",
+            },
+        ),
+        (
+            "--current-objects",
+            {
+                "required": True,
+                "help": "Path to JSON file containing current S3 objects listing in this version directory",
+            },
+        ),
+        (
+            "--output-dir",
+            {
+                "required": True,
+                "help": "Directory to store generated index files",
+            },
+        ),
+        (
+            "--wheel-dir",
+            {
+                "default": None,
+                "help": "Directory containing wheel files (default to be same as `version`)",
+            },
+        ),
+        (
+            "--alias-to-default",
+            {
+                "default": None,
+                "help": "Alias variant name for the default variant",
+            },
+        ),
+        (
+            "--comment",
+            {
+                "default": "",
+                "help": "Optional comment string to include in generated HTML files",
+            },
+        ),
+    ]
+    for flag, extra in option_specs:
+        cli.add_argument(flag, type=str, **extra)
+
+    args = cli.parse_args()
+
+    version = args.version
+    # the version is used as a directory name, so path separators are rejected
+    if any(sep in version for sep in ("/", "\\")):
+        raise ValueError("Version string must not contain slashes.")
+    objects_json = Path(args.current_objects)
+    output_dir = Path(args.output_dir)
+    if not output_dir.exists():
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Read current objects JSON
+    with objects_json.open() as fh:
+        current_objects: dict[str, list[dict[str, Any]]] = json.load(fh)
+
+    # current_objects looks like from list_objects_v2 S3 API:
+    """
+    "Contents": [
+        {
+            "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
+            "LastModified": "2025-11-28T14:00:32+00:00",
+            "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
+            "ChecksumAlgorithm": [
+                "CRC64NVME"
+            ],
+            "ChecksumType": "FULL_OBJECT",
+            "Size": 435649349,
+            "StorageClass": "STANDARD"
+        },
+        ...
+    ]
+    """
+
+    # Extract wheel file keys; only the filename (last path component) is used
+    wheel_files = [
+        entry["Key"].split("/")[-1]
+        for entry in current_objects.get("Contents", [])
+        if entry["Key"].endswith(".whl")
+    ]
+
+    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
+
+    # keep only "official" files for a non-nightly version (specified by cli args)
+    PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
+    if PY_VERSION_RE.match(version):
+        # upload-wheels.sh ensures no "dev" is in args.version
+        wheel_files = [
+            name for name in wheel_files if version in name and "dev" not in name
+        ]
+        print(f"Non-nightly version detected, wheel files used: {wheel_files}")
+    else:
+        print("Nightly version detected, keeping all wheel files.")
+
+    # Generate index and metadata, assuming wheels and indices are stored as:
+    # s3://vllm-wheels/{wheel_dir}/<wheel files>
+    # s3://vllm-wheels/<anything>/<index files>
+    wheel_dir = args.wheel_dir or version
+    wheels_root = output_dir.parent / wheel_dir.strip().rstrip("/")
+
+    generate_index_and_metadata(
+        whl_files=wheel_files,
+        wheel_base_dir=wheels_root,
+        index_base_dir=output_dir,
+        default_variant=None,
+        alias_to_default=args.alias_to_default,
+        comment=args.comment.strip(),
+    )
+    print(f"Successfully generated index and metadata in {output_dir}")
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -141,7 +141,6 @@ if [[ $commands == *" entrypoints/openai "* ]]; then
  --ignore=entrypoints/openai/test_audio.py \
  --ignore=entrypoints/openai/test_shutdown.py \
  --ignore=entrypoints/openai/test_completion.py \
-  --ignore=entrypoints/openai/test_sleep.py \
  --ignore=entrypoints/openai/test_models.py \
  --ignore=entrypoints/openai/test_lora_adapters.py \
  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@ -0,0 +1,68 @@
+#!/bin/bash
+
+# This script builds the CPU docker image and runs offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# allow to bind to different cores
+CORE_RANGE=${CORE_RANGE:-0-16}
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
+
+export CMAKE_BUILD_PARALLEL_LEVEL=16
+
+# Setup cleanup
+# Force-remove the `cpu-test` container. `|| true` ignores any failure of
+# `docker rm`, so this never aborts the script despite `set -e`.
+remove_docker_container() {
+    set -e;
+    docker rm -f cpu-test || true;
+}
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Try building the docker image
+docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
+
+# Run the image
+docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
+
+# Run the CPU checks inside the `cpu-test` container.
+# Each step is a separate `docker exec ... bash -c`, i.e. a fresh shell, which is
+# why `set -e` is repeated at the start of every step.
+function cpu_tests() {
+  set -e
+
+  # print the installed Python packages into the log
+  docker exec cpu-test bash -c "
+    set -e
+    pip list"
+
+  # offline inference
+  docker exec cpu-test bash -c "
+    set -e
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+
+  # Run model tests
+  docker exec cpu-test bash -c "
+    set -e
+    pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
+
+  # Run kernel tests
+  docker exec cpu-test bash -c "
+    set -e
+    pytest -x -v -s tests/kernels/test_onednn.py
+    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+    pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
+
+  # basic online serving: start the server in the background, wait up to 600s for
+  # /v1/models to respond, run a 20-prompt benchmark, then send SIGTERM to the server.
+  # The command is single-quoted so that $E2E_OMP_THREADS, $! and $server_pid are
+  # expanded by the shell inside the container rather than by this script.
+  docker exec cpu-test bash -c '
+    set -e
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
+    server_pid=$!
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    vllm bench serve \
+      --backend vllm \
+      --dataset-name random \
+      --model Qwen/Qwen3-0.6B \
+      --num-prompts 20 \
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid &'
+}
+
+# All of the CPU tests are expected to finish in less than 40 mins; the 2h timeout below is only an upper bound.
+export -f cpu_tests
+timeout 2h bash -c cpu_tests
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@ -25,20 +25,22 @@ function cpu_tests() {

  # offline inference
  podman exec -it "$container_id" bash -c "
+    export TORCH_COMPILE_DISABLE=1
    set -xve
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log

  # Run basic model test
  podman exec -it "$container_id" bash -c "
+    export TORCH_COMPILE_DISABLE=1
    set -evx
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
-    pip install sentence-transformers datamodel_code_generator
+    pip install sentence-transformers datamodel_code_generator tblib 

    # Note: disable Bart until supports V1
    # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -21,8 +21,8 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
@ -50,6 +50,7 @@ function cpu_tests() {
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+    pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
    pytest -x -v -s tests/kernels/test_onednn.py"

  # Run basic model test
--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}

 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
+ENV SOC_VERSION="ascend910b1"

 RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -35,10 +35,11 @@ docker run \
    echo $ZE_AFFINITY_MASK
    pip install tblib==3.1.0
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
    cd tests
    pytest -v -s v1/core
    pytest -v -s v1/engine
@ -46,6 +47,6 @@ docker run \
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
-    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
+    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
    pytest -v -s v1/test_serial_utils.py
 '
--- a/.buildkite/scripts/run-prime-rl-test.sh
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
 PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
 PRIME_RL_DIR="${REPO_ROOT}/prime-rl"

+if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
+    echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
+    exit 0
+fi
+
 echo "Setting up Prime-RL integration test environment..."

 # Clean up any existing Prime-RL directory
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@ -17,7 +17,17 @@ wait_for_server() {
 }

 MODEL="deepseek-ai/DeepSeek-V2-lite"
-BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+  # ROCm platform
+  BACKENDS=("allgather_reducescatter")
+  # Disable MOE padding for ROCm since it is causing eplb to fail
+  export VLLM_ROCM_MOE_PADDING=0
+else
+  # Non-ROCm platform (CUDA/other)
+  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi

 cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
--- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
+THRESHOLD=${1:-0.8}
+NUM_Q=${2:-1319}
+PORT=${3:-8020}
+DATA_PARALLEL_SIZE=${4:-2}
+TENSOR_PARALLEL_SIZE=${5:-2}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {  # Poll the /health endpoint once per second until it succeeds; gives up after 600s
+  local port=$1  # port on 127.0.0.1 to poll
+  timeout 600 bash -c '
+    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+      sleep 1
+    done'
+}
+
+MODEL="QWen/Qwen3-30B-A3B-FP8"
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+  # ROCm platform
+  BACKENDS=("allgather_reducescatter")
+  # Disable MOE padding for ROCm since it is causing eplb to fail
+  export VLLM_ROCM_MOE_PADDING=0
+else
+  # Non-ROCm platform (CUDA/other)
+  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi
+
+cleanup() {  # Stop the background server process, if one is still running
+  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then  # only act when SERVER_PID is set and the process is alive
+    kill "${SERVER_PID}" 2>/dev/null || true  # polite SIGTERM first
+    for _ in {1..20}; do  # then wait up to ~10s (20 x 0.5s) for it to exit
+      kill -0 "${SERVER_PID}" 2>/dev/null || break
+      sleep 0.5
+    done
+    kill -9 "${SERVER_PID}" 2>/dev/null || true  # force-kill whatever is left; errors are ignored if it already exited
+  fi
+}
+trap cleanup EXIT  # run cleanup on every script exit, including failures under set -e
+
+for BACK in "${BACKENDS[@]}"; do  # per backend: serve the model, run the GSM8K eval, assert accuracy >= THRESHOLD, tear down
+  VLLM_DEEP_GEMM_WARMUP=skip \
+  vllm serve "$MODEL" \
+    --enforce-eager \
+    --enable-eplb \
+    --all2all-backend $BACK \
+    --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
+    --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
+    --data-parallel-size ${DATA_PARALLEL_SIZE} \
+    --enable-expert-parallel \
+    --trust-remote-code \
+    --max-model-len 2048 \
+    --port $PORT &  # server runs in the background
+  SERVER_PID=$!  # remembered so cleanup() can stop this server
+  wait_for_server $PORT
+
+  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')  # NOTE(review): in tr, '\\' is a literal backslash, so this set maps the letter "n" (not newline) to "_" - confirm intent
+  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 - <<PY
+import json; acc=json.load(open('${OUT}'))['accuracy']
+print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
+assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+  cleanup
+  SERVER_PID=  # cleared so the EXIT trap does not act on the already-stopped server
+  sleep 1
+  PORT=$((PORT+1))  # the next backend is served on the next port
+done
--- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
@ -2,9 +2,9 @@
 set -euxo pipefail

 # args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
-THRESHOLD=${1:-0.8}
+THRESHOLD=${1:-0.25}
 NUM_Q=${2:-1319}
-PORT=${3:-8020}
+PORT=${3:-8040}
 OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
 mkdir -p "${OUT_DIR}"

@ -16,8 +16,18 @@ wait_for_server() {
    done'
 }

-MODEL="QWen/Qwen3-30B-A3B-FP8"
-BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
+
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+  # ROCm platform
+  BACKENDS=("allgather_reducescatter")
+  # Disable MOE padding for ROCm since it is causing eplb to fail
+  export VLLM_ROCM_MOE_PADDING=0
+else
+  # Non-ROCm platform (CUDA/other)
+  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi

 cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
@ -33,14 +43,17 @@ trap cleanup EXIT

 for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
-  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
-    --tensor-parallel-size 2 \
-    --data-parallel-size 2 \
+    --tensor-parallel-size 4 \
    --enable-expert-parallel \
+    --enable-eplb \
+    --all2all-backend $BACK \
+    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
+    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
+    --gpu-memory-utilization 0.9 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@ -2,6 +2,28 @@

 set -ex

+# ======== part 0: setup ========
+
+BUCKET="vllm-wheels"
+INDICES_OUTPUT_DIR="indices"
+DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
+PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+
+# detect if python3.12+ is available
+has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
+if [[ "$has_new_python" -eq 0 ]]; then
+    # use new python from docker
+    docker pull python:3-slim
+    PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
+fi
+
+echo "Using python interpreter: $PYTHON"
+echo "Python version: $($PYTHON --version)"
+
+# ========= part 1: collect, rename & upload the wheel ==========
+
 # Assume wheels are in artifacts/dist/*.whl
 wheel_files=(artifacts/dist/*.whl)

@ -10,74 +32,77 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
  exit 1
 fi
-
-# Get the single wheel file
 wheel="${wheel_files[0]}"

-# Detect architecture and rename 'linux' to appropriate manylinux version
-arch=$(uname -m)
-if [[ $arch == "x86_64" ]]; then
-    manylinux_version="manylinux1"
-elif [[ $arch == "aarch64" ]]; then
-    manylinux_version="manylinux2014"
-else
-    echo "Warning: Unknown architecture $arch, using manylinux1 as default"
-    manylinux_version="manylinux1"
-fi
+# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
+# we also accept params as manylinux tag
+# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
+manylinux_version="${1:-manylinux_2_31}"

 # Rename 'linux' to the appropriate manylinux version in the wheel filename
+if [[ "$wheel" != *"linux"* ]]; then
+  echo "Error: Wheel filename does not contain 'linux': $wheel"
+  exit 1
+fi
 new_wheel="${wheel/linux/$manylinux_version}"
 mv -- "$wheel" "$new_wheel"
 wheel="$new_wheel"
+echo "Renamed wheel to: $wheel"

 # Extract the version from the wheel
 version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version: $version"
+echo "Version in wheel: $version"
+pure_version="${version%%+*}"
+echo "Pure version (without variant): $pure_version"

-normal_wheel="$wheel" # Save the original wheel filename
+# copy wheel to its own bucket
+aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"

-# If the version contains "dev", rename it to v1.0.0.dev for consistency
-if [[ $version == *dev* ]]; then
-    suffix="${version##*.}"
-    if [[ $suffix == cu* ]]; then
-        new_version="1.0.0.dev+${suffix}"
-    else
-        new_version="1.0.0.dev"
-    fi
-    new_wheel="${wheel/$version/$new_version}"
-    # use cp to keep both files in the artifacts directory
-    cp -- "$wheel" "$new_wheel"
-    wheel="$new_wheel"
-    version="$new_version"
-fi
+# ========= part 2: generate and upload indices ==========
+# generate indices for all existing wheels in the commit directory
+# this script might be run multiple times if there are multiple variants being built
+# so we need to guarantee there is little chance for "TOCTOU" issues
+# i.e., one process is generating indices while another is uploading a new wheel
+# so we need to ensure no time-consuming operations happen below

-# Upload the wheel to S3
-python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+# list all wheels in the commit directory
+echo "Existing wheels on S3:"
+aws s3 ls "$S3_COMMIT_PREFIX"
+obj_json="objects.json"
+aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
+mkdir -p "$INDICES_OUTPUT_DIR"

-# generate index for this commit
-aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-
-if [[ $normal_wheel == *"cu129"* ]]; then
-    # only upload index.html for cu129 wheels (default wheels) as it
-    # is available on both x86 and arm64
-    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
-    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+# call script to generate indices for all existing wheels
+# these indices use relative paths, which work as long as the index is placed next to the wheel directory in s3
+# i.e., the wheels are always in s3://vllm-wheels/<commit>/
+# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
+if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
+    alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
 else
-    echo "Skipping index files for non-cu129 wheels"
+    alias_arg=""
 fi

-# generate index for nightly
-aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+# HACK: we do not need regex module here, but it is required by pre-commit hook
+# To avoid any external dependency, we simply replace it back to the stdlib re module
+sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg

-if [[ $normal_wheel == *"cu129"* ]]; then
-    # only upload index.html for cu129 wheels (default wheels) as it
-    # is available on both x86 and arm64
-    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
-else
-    echo "Skipping index files for non-cu129 wheels"
+# copy indices to /<commit>/ unconditionally
+echo "Uploading indices to $S3_COMMIT_PREFIX"
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
+
+# copy to /nightly/ only if it is on the main branch and not a PR 
+if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
+    echo "Uploading indices to overwrite /nightly/"
+    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
 fi

-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
-aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
+# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
+if [[ "$version" != *"dev"* ]]; then
+    echo "Re-generating indices for /$pure_version/"
+    rm -rf "${INDICES_OUTPUT_DIR:?}"/*  # glob must sit outside the quotes to expand; :? aborts rather than expanding to "/*" if the variable is ever empty
+    mkdir -p "$INDICES_OUTPUT_DIR"
+    # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
+    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
+    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
+fi
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@ -39,9 +39,9 @@ steps:
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to whitelist
  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
  soft_fail: true
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
@ -50,9 +50,9 @@ steps:

 - label: Async Engine, Inputs, Utils, Worker Test # 10min
  timeout_in_minutes: 15
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/multimodal
@ -61,25 +61,31 @@ steps:
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_

- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
-  timeout_in_minutes: 10
-  mirror_hardwares: [amdexperimental, amdproduction]
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
  - tests/standalone_tests/lazy_imports.py
+  - tests/tokenizers_
+  - tests/tool_parsers
  - tests/transformers_utils
+  - tests/config
  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s tokenizers_
+  - pytest -v -s tool_parsers
  - pytest -v -s transformers_utils
+  - pytest -v -s config

 - label: Python-only Installation Test # 10min
  timeout_in_minutes: 20
@ -111,9 +117,9 @@ steps:
  - pytest -v -s basic_correctness/test_cpu_offload.py

 - label: Entrypoints Unit Tests # 5min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
  timeout_in_minutes: 10
  working_dir: "/vllm-workspace/tests"
  fast_check: true
@ -122,7 +128,7 @@ steps:
  - tests/entrypoints/
  commands:
  - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling

 - label: Entrypoints Integration Test (LLM) # 30min
  timeout_in_minutes: 40
@ -142,7 +148,7 @@ steps:
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Entrypoints Integration Test (API Server) # 100min
+- label: Entrypoints Integration Test (API Server 1) # 100min
  timeout_in_minutes: 130
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
@ -156,10 +162,31 @@ steps:
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/test_vision_embeds.py
+  # Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
+  # TODO: Remove after next torch update
+  - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s entrypoints/openai/test_vision_embeds.py
  - pytest -v -s entrypoints/test_chat_utils.py

+- label: Entrypoints Integration Test (API Server 2)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/sleep
+  - tests/entrypoints/rpc
+  - tests/tool_use
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/sleep
+  - pytest -v -s tool_use
+  - PYTHONPATH=/vllm-workspace  pytest -v -s entrypoints/rpc
+
 - label: Entrypoints Integration Test (Pooling)
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
@ -210,6 +237,7 @@ steps:
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@ -248,9 +276,9 @@ steps:
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep

 - label: EPLB Algorithm Test # 5min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
@ -306,28 +334,27 @@ steps:
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: Engine Test # 25min
-  timeout_in_minutes: 40
+- label: Engine Test # 9min
+  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/engine
-  - tests/tokenization
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
-  # OOM in the CI unless we run this separately
-  - pytest -v -s tokenization

- label: V1 Test e2e + engine # 30min
-  timeout_in_minutes: 45
+- label: V1 Test e2e + engine # 65min
+  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_1
+  # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
+  # See discussion here: https://github.com/vllm-project/vllm/pull/31040
+  agent_pool: mi325_8
  # grade: Blocking
  source_file_dependencies:
    - vllm/
@ -340,9 +367,9 @@ steps:

 - label: V1 Test entrypoints # 35min
  timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
  source_file_dependencies:
    - vllm/
    - tests/v1
@ -390,10 +417,34 @@ steps:
  commands:
    - pytest -v -s v1/attention

- label: V1 Test others (CPU) # 5 mins
-  mirror_hardwares: [amdexperimental, amdproduction]
+- label: Batch Invariance Tests (H100) # 10min
+  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
-  # grade: Blocking
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - vllm/model_executor/layers
+    - tests/v1/determinism/
+  commands:
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pip install pytest-timeout pytest-forked
+    - pytest -v -s v1/determinism/test_batch_invariance.py
+    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+
+- label: V1 Test attention (B200) # 10min
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
+
+- label: V1 Test others (CPU) # 5 mins
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  agent_pool: mi325_1
+  grade: Blocking
  source_file_dependencies:
    - vllm/
    - tests/v1
@ -409,29 +460,34 @@ steps:

 - label: Examples Test # 30min
  timeout_in_minutes: 45
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
  - vllm/entrypoints
+  - vllm/multimodal
  - examples/
  commands:
    - pip install tensorizer # for tensorizer test
+    # for basic
+    - python3 offline_inference/basic/chat.py
    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-    - python3 offline_inference/basic/chat.py
-    - python3 offline_inference/prefix_caching.py
-    - python3 offline_inference/llm_engine_example.py
-    - python3 offline_inference/audio_language.py --seed 0
-    - python3 offline_inference/vision_language.py --seed 0
-    - python3 offline_inference/vision_language_pooling.py --seed 0
-    - python3 offline_inference/vision_language_multi_image.py --seed 0
-    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
+    # for multi-modal models
+    - python3 offline_inference/audio_language.py --seed 0
+    - python3 offline_inference/vision_language.py --seed 0
+    - python3 offline_inference/vision_language_multi_image.py --seed 0
+    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+    # for pooling models
+    - python3 pooling/pooling/vision_language_pooling.py --seed 0
+    # for features demo
+    - python3 offline_inference/prefix_caching.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@ -485,7 +541,7 @@ steps:

 - label: PyTorch Compilation Unit Tests # 15min
  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
@ -502,7 +558,7 @@ steps:

 - label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
@ -529,7 +585,7 @@ steps:
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
    # Limit to no custom ops to reduce running time
    # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

 - label: Cudagraph test
  timeout_in_minutes: 20
@ -558,7 +614,7 @@ steps:

 - label: Kernels Attention Test %N # 23min
  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  # grade: Blocking
  source_file_dependencies:
@ -585,7 +641,7 @@ steps:

 - label: Kernels MoE Test %N # 40min
  timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  # grade: Blocking
  source_file_dependencies:
@ -612,6 +668,26 @@ steps:
  commands:
    - pytest -v -s kernels/mamba

+- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
+# Not replicating for CUTLASS & CuTe
+  timeout_in_minutes: 45
+  gpu: h100
+  num_gpus: 1
+  source_file_dependencies:
+  - tools/install_deepgemm.sh
+  - vllm/utils/deep_gemm.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
+  commands:
+    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+    - pytest -v -s kernels/moe/test_deepgemm.py
+    - pytest -v -s kernels/moe/test_batched_deepgemm.py
+    - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
 - label: Model Executor Test # 23min
  timeout_in_minutes: 35
  torch_nightly: true
@ -669,19 +745,21 @@ steps:
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  # TODO(jerryzh168): resolve the above comment
-  - uv pip install --system torchao==0.13.0
+  - uv pip install --system torchao==0.14.1
+  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

- label: LM Eval Small Models # 15min
-  timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental, amdproduction]
+- label: LM Eval Small Models # 53min
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
+  autorun_on_main: true
  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt

 - label: OpenAI API correctness # 10min
  timeout_in_minutes: 15
@ -692,33 +770,10 @@ steps:
  - csrc/
  - vllm/entrypoints/openai/
  - vllm/model_executor/models/whisper.py
-  commands: # LMEval
+  commands: # LMEval+Transcription WER check
  # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
-  - pytest -s entrypoints/openai/correctness/  --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py
+  - pytest -s entrypoints/openai/correctness/

- label: OpenAI-Compatible Tool Use # 23 min
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
-  # grade: Blocking
-  fast_check: false
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  commands:
-    - pytest -v -s -m 'not cpu_test' tool_use
-
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
-  # grade: Blocking
-  timeout_in_minutes: 10
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  no_gpu: true
-  commands:
-    - pytest -v -s -m 'cpu_test' tool_use

 #####  models test  #####

@ -743,6 +798,7 @@ steps:
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
+  - vllm/transformers_utils/
  - tests/models/test_initialization.py
  commands:
    # Only when vLLM model source is modified - test initialization of a large
@ -888,6 +944,18 @@ steps:
  commands:
    - pytest -v -s models/language/pooling_mteb_test

+- label: Multi-Modal Processor Test (CPU)
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  no_gpu: true
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
 - label: Multi-Modal Processor Test # 44min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
@ -901,7 +969,7 @@ steps:
    - pytest -v -s models/multimodal/processing

 - label: Multi-Modal Models Test (Standard) # 60min
-  timeout_in_minutes: 80
+  timeout_in_minutes: 100
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
@ -910,13 +978,18 @@ steps:
  - vllm/
  - tests/models/multimodal
  commands:
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pip freeze | grep -E 'torch'
-    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py
+    # Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
+    # TODO: Remove after next torch update
+    - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

- label: Multi-Modal Accuracy Eval (Small Models) # 10min
-  timeout_in_minutes: 70
+- label: Multi-Modal Accuracy Eval (Small Models) # 5min
+  timeout_in_minutes: 10
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
@ -926,9 +999,12 @@ steps:
  - vllm/inputs/
  - vllm/v1/core/
  commands:
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+  - export MIOPEN_DEBUG_CONV_DIRECT=0
+  - export MIOPEN_DEBUG_CONV_GEMM=0
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt

- label: Multi-Modal Models Test (Extended) 1
+- label: Multi-Modal Models Test (Extended) 1 # 60min
+  timeout_in_minutes: 120
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
@ -937,10 +1013,13 @@ steps:
  - vllm/
  - tests/models/multimodal
  commands:
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing

- label: Multi-Modal Models Test (Extended) 2
+- label: Multi-Modal Models Test (Extended) 2 # 60min
+  timeout_in_minutes: 120
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
@ -949,10 +1028,13 @@ steps:
  - vllm/
  - tests/models/multimodal
  commands:
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

- label: Multi-Modal Models Test (Extended) 3
+- label: Multi-Modal Models Test (Extended) 3 # 75min
+  timeout_in_minutes: 150
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
@ -961,6 +1043,8 @@ steps:
  - vllm/
  - tests/models/multimodal
  commands:
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

@ -995,12 +1079,12 @@ steps:
  optional: true
  commands:
    - pip install --upgrade git+https://github.com/huggingface/transformers
-    - pytest -v -s tests/models/test_initialization.py
+    - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
    - pytest -v -s tests/models/test_transformers.py
-    - pytest -v -s tests/models/multimodal/processing/
-    - pytest -v -s tests/models/multimodal/test_mapping.py
+    # - pytest -v -s tests/models/multimodal/processing/
+    - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
    - python3 examples/offline_inference/basic/chat.py
-    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+    # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

@ -1044,8 +1128,9 @@ steps:
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
+    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

- label: Blackwell Fusion Tests # 30 min
+- label: Blackwell Fusion and Compile Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
@ -1053,11 +1138,18 @@ steps:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
  commands:
    - nvidia-smi
    - pytest -v -s tests/compile/test_fusion_attn.py
@ -1066,7 +1158,9 @@ steps:
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
    # Wrap with quotes to escape yaml
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

 - label: Blackwell Fusion E2E Tests # 30 min
  timeout_in_minutes: 40
@ -1084,19 +1178,15 @@ steps:
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
-  - tests/compile/fullgraph/test_full_graph.py
  commands:
    - nvidia-smi
    # Run all e2e fusion tests
    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: ROCm GPT-OSS Eval
+- label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
-  agent_pool: mi325_1
-  mirror_hardwares: [amdproduction]
+  gpu: b200
  optional: true # run on nightlies
  source_file_dependencies:
  - tests/evals/gpt_oss
@ -1105,7 +1195,7 @@ steps:
  - vllm/v1/attention/backends/flashinfer.py
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
-    - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

 - label: Blackwell Quantized MoE Test
  timeout_in_minutes: 60
@ -1132,7 +1222,7 @@ steps:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt

 #####  1 GPU test  #####
 #####  multi gpus test  #####
@ -1172,13 +1262,13 @@ steps:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code

 - label: Distributed Tests (2 GPUs) # 68min
  timeout_in_minutes: 90
@ -1205,6 +1295,7 @@ steps:
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
@ -1240,7 +1331,7 @@ steps:

 - label: Plugin Tests (2 GPUs) # 40min
  timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_2
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
@ -1256,7 +1347,9 @@ steps:
  # end platform plugin tests
  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
  - pip install -e ./plugins/prithvi_io_processor_plugin
-  - pytest -v -s plugins_tests/test_io_processor_plugins.py
+  # Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
+  # TODO: Remove after next torch update
+  - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
  # begin stat_logger plugins test
@ -1308,12 +1401,15 @@ steps:
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
    - pytest -v -s -x lora/test_olmoe_tp.py
-    - pytest -v -s -x lora/test_gptoss_tp.py
+
+    # Disabled for now because MXFP4 backend on non-cuda platform
+    # doesn't support LoRA yet
+    #- pytest -v -s -x lora/test_gptoss_tp.py


 - label: Weight Loading Multiple GPU Test  # 33min
  timeout_in_minutes: 45
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_2
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
@ -1372,7 +1468,83 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

+
 - label: LM Eval Large Models # optional
+  gpu: a100
+  optional: true
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  gpu: h100
+  optional: true
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
+    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+
+##### H200 test #####
+- label: Distributed Tests (H200) # optional
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  gpu: h200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+    - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+    - pytest -v -s tests/distributed/test_context_parallel.py
+    - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+    - pytest -v -s tests/v1/distributed/test_dbo.py
+
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+  gpu: b200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+    - pytest -v -s tests/distributed/test_context_parallel.py
+    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+    - pytest -v -s tests/v1/distributed/test_dbo.py
+
+##### E2E Eval Tests #####
+- label: LM Eval Small Models (1 Card) # 15min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
+
+- label: LM Eval Large Models (4 Card)
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
@ -1387,50 +1559,29 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

-##### H100 test #####
- label: LM Eval Large Models (H100) # optional
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_4
-  # grade: Blocking
-  gpu: h100
-  optional: true
-  num_gpus: 4
+- label: ROCm LM Eval Large Models (8 Card)
+  mirror_hardwares: [amdproduction]
+  agent_pool: mi325_8
+  num_gpus: 8
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
+
+- label: ROCm GPT-OSS Eval
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  agent_pool: mi325_1
+  mirror_hardwares: [amdexperimental, amdproduction]
+  optional: true # run on nightlies
  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
  commands:
-    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
-    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
-
-##### H200 test #####
- label: Distributed Tests (H200) # optional
-  mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_2
-  # grade: Blocking
-  gpu: h200
-  optional: true
-  working_dir: "/vllm-workspace/"
-  num_gpus: 2
-  commands:
-    - pytest -v -s tests/compile/distributed/test_async_tp.py
-    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
-    - pytest -v -s tests/distributed/test_context_parallel.py
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
-    - pytest -v -s tests/v1/distributed/test_dbo.py
-
-##### B200 test #####
- label: Distributed Tests (B200) # optional
-  gpu: b200
-  optional: true
-  working_dir: "/vllm-workspace/"
-  num_gpus: 2
-  commands:
-    - pytest -v -s tests/distributed/test_context_parallel.py
-    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
-    - pytest -v -s tests/v1/distributed/test_dbo.py
+    - uv pip install --system 'gpt-oss[eval]==0.0.5'
+    - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min
@ -1446,9 +1597,8 @@ steps:
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
    - bash .buildkite/scripts/run-prime-rl-test.sh
-
 - label: DeepSeek V2-Lite Accuracy
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  timeout_in_minutes: 60
@ -1459,8 +1609,8 @@ steps:
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

- label: Qwen3-30B-A3B-FP8-block Accuracy
-  mirror_hardwares: [amdexperimental]
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  timeout_in_minutes: 60
@ -1469,4 +1619,36 @@ steps:
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -57,14 +57,16 @@ steps:
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_

- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
-  timeout_in_minutes: 10
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
+  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
  - tests/standalone_tests/lazy_imports.py
+  - tests/tokenizers_
+  - tests/tool_parsers
  - tests/transformers_utils
  - tests/config
  no_gpu: true
@ -73,6 +75,8 @@ steps:
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s tokenizers_
+  - pytest -v -s tool_parsers
  - pytest -v -s transformers_utils
  - pytest -v -s config

@ -110,7 +114,7 @@ steps:
  - tests/entrypoints/
  commands:
  - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling

 - label: Entrypoints Integration Test (LLM) # 30min
  timeout_in_minutes: 40
@ -128,7 +132,7 @@ steps:
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Entrypoints Integration Test (API Server) # 100min
+- label: Entrypoints Integration Test (API Server 1) # 100min
  timeout_in_minutes: 130
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
@ -140,10 +144,26 @@ steps:
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/
  - pytest -v -s entrypoints/test_chat_utils.py

+- label: Entrypoints Integration Test (API Server 2)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/sleep
+  - tests/entrypoints/rpc
+  - tests/tool_use
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/sleep
+  - PYTHONPATH=/vllm-workspace  pytest -v -s entrypoints/rpc
+  - pytest -v -s tool_use
+
 - label: Entrypoints Integration Test (Pooling)
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
@ -192,6 +212,7 @@ steps:
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@ -275,21 +296,18 @@ steps:
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: Engine Test # 25min
-  timeout_in_minutes: 40
+- label: Engine Test # 9min
+  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/engine
-  - tests/tokenization
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
-  # OOM in the CI unless we run this separately
-  - pytest -v -s tokenization

 - label: V1 Test e2e + engine # 30min
  timeout_in_minutes: 45
@ -301,7 +319,10 @@ steps:
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
    - pytest -v -s v1/e2e
-    - pytest -v -s v1/engine
+    # Run this test standalone for now;
+    # need to untangle the (implicit) use of spawn/fork across the tests.
+    - pytest -v -s v1/engine/test_preprocess_error_handling.py
+    - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py

 - label: V1 Test entrypoints # 35min
  timeout_in_minutes: 50
@ -346,6 +367,19 @@ steps:
  commands:
    - pytest -v -s v1/attention

+- label: Batch Invariance Tests (H100) # 10min
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - vllm/model_executor/layers
+    - tests/v1/determinism/
+  commands:
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pip install pytest-timeout pytest-forked
+    - pytest -v -s v1/determinism/test_batch_invariance.py
+    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+
 - label: V1 Test attention (B200) # 10min
  timeout_in_minutes: 30
  gpu: b200
@ -375,23 +409,28 @@ steps:
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
  - vllm/entrypoints
+  - vllm/multimodal
  - examples/
  commands:
    - pip install tensorizer # for tensorizer test
+    # for basic
+    - python3 offline_inference/basic/chat.py
    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-    - python3 offline_inference/basic/chat.py
-    - python3 offline_inference/prefix_caching.py
-    - python3 offline_inference/llm_engine_example.py
-    - python3 offline_inference/audio_language.py --seed 0
-    - python3 offline_inference/vision_language.py --seed 0
-    - python3 offline_inference/vision_language_pooling.py --seed 0
-    - python3 offline_inference/vision_language_multi_image.py --seed 0
-    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
+    # for multi-modal models
+    - python3 offline_inference/audio_language.py --seed 0
+    - python3 offline_inference/vision_language.py --seed 0
+    - python3 offline_inference/vision_language_multi_image.py --seed 0
+    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+    # for pooling models
+    - python3 pooling/pooling/vision_language_pooling.py --seed 0
+    # for features demo
+    - python3 offline_inference/prefix_caching.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@ -450,7 +489,9 @@ steps:
  # tests covered elsewhere.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
+  # However, find does not normally propagate error codes, so we combine it with xargs
+  # (using -0 for proper path handling)
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"

 - label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
@ -464,7 +505,9 @@ steps:
  # as it is a heavy test that is covered in other steps.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
+  # However, find does not normally propagate error codes, so we combine it with xargs
+  # (using -0 for proper path handling)
+  - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"

 - label: PyTorch Fullgraph Test # 27min
  timeout_in_minutes: 40
@ -550,6 +593,25 @@ steps:
  commands:
    - pytest -v -s kernels/mamba

+- label: Kernels DeepGEMM Test (H100)
+  timeout_in_minutes: 45
+  gpu: h100
+  num_gpus: 1
+  source_file_dependencies:
+  - tools/install_deepgemm.sh
+  - vllm/utils/deep_gemm.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
+  commands:
+    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+    - pytest -v -s kernels/moe/test_deepgemm.py
+    - pytest -v -s kernels/moe/test_batched_deepgemm.py
+    - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
 - label: Model Executor Test # 23min
  timeout_in_minutes: 35
  torch_nightly: true
@ -599,7 +661,8 @@ steps:
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  # TODO(jerryzh168): resolve the above comment
-  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
+  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 - label: LM Eval Small Models # 53min
@ -610,7 +673,7 @@ steps:
  - vllm/model_executor/layers/quantization
  autorun_on_main: true
  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt

 - label: OpenAI API correctness # 22min
  timeout_in_minutes: 30
@ -622,25 +685,6 @@ steps:
  commands: # LMEval+Transcription WER check
  - pytest -s entrypoints/openai/correctness/

- label: OpenAI-Compatible Tool Use # 23 min
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
-  fast_check: false
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  commands:
-    - pytest -v -s -m 'not cpu_test' tool_use
-
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
-  timeout_in_minutes: 10
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  no_gpu: true
-  commands:
-    - pytest -v -s -m 'cpu_test' tool_use
-
 #####  models test  #####

 - label: Basic Models Tests (Initialization)
@ -650,6 +694,7 @@ steps:
  source_file_dependencies:
  - vllm/
  - tests/models/test_initialization.py
+  - tests/models/registry.py
  commands:
    # Run a subset of model initialization tests
    - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
@ -660,7 +705,9 @@ steps:
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
+  - vllm/transformers_utils/
  - tests/models/test_initialization.py
+  - tests/models/registry.py
  commands:
    # Only when vLLM model source is modified - test initialization of a large
    # subset of supported models (the complement of the small subset in the above
@ -786,14 +833,24 @@ steps:
  commands:
    - pytest -v -s models/language/pooling_mteb_test

- label: Multi-Modal Processor Test # 44min
+- label: Multi-Modal Processor Test (CPU)
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  no_gpu: true
+  commands:
+    - "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'"
+    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Processor Test
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal/processing
+    - pytest -v -s models/multimodal/processing/test_tensor_schema.py

 - label: Multi-Modal Models Test (Standard) # 60min
  timeout_in_minutes: 80
@ -870,11 +927,12 @@ steps:
 - label: Transformers Nightly Models Test
  working_dir: "/vllm-workspace/"
  optional: true
+  soft_fail: true
  commands:
    - pip install --upgrade git+https://github.com/huggingface/transformers
-    - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)'
+    - pytest -v -s tests/models/test_initialization.py
    - pytest -v -s tests/models/test_transformers.py
-    # - pytest -v -s tests/models/multimodal/processing/
+    - pytest -v -s tests/models/multimodal/processing/
    - pytest -v -s tests/models/multimodal/test_mapping.py
    - python3 examples/offline_inference/basic/chat.py
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
@ -1015,7 +1073,7 @@ steps:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt

 #####  1 GPU test  #####
 #####  multi gpus test  #####
@ -1051,13 +1109,13 @@ steps:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code

 - label: Distributed Tests (2 GPUs) # 68min
  timeout_in_minutes: 90
@ -1084,6 +1142,7 @@ steps:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
@ -1173,6 +1232,8 @@ steps:
    # FIXIT: find out which code initialize cuda before running the test
    # before the fix, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    # A lot of these tests are on the edge of OOMing
+    - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
    # There is some Tensor Parallelism related processing logic in LoRA that
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
@ -1267,13 +1328,13 @@ steps:
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
-    - pytest -v -s tests/compile/distributed/test_async_tp.py
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
-    - pytest -v -s tests/distributed/test_sequence_parallel.py
+    - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
    - pytest -v -s tests/v1/distributed/test_dbo.py

 ##### B200 test #####
@ -1291,12 +1352,14 @@ steps:
 - label: Prime-RL Integration Test # 15min
  timeout_in_minutes: 30
  optional: true
+  soft_fail: true
  num_gpus: 2
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
+    - nvidia-smi
    - bash .buildkite/scripts/run-prime-rl-test.sh

 - label: DeepSeek V2-Lite Accuracy
@ -1308,11 +1371,20 @@ steps:
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

- label: Qwen3-30B-A3B-FP8-block Accuracy
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
--- a/.buildkite/test_areas/attention.yaml
+++ b/.buildkite/test_areas/attention.yaml
@ -0,0 +1,21 @@
+group: Attention
+depends_on: 
+  - image-build
+steps:
+- label: V1 attention (H100)
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - pytest -v -s v1/attention
+
+- label: V1 attention (B200)
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FlashInfer prefill is buggy and produces incorrect results; fix this
--- a/.buildkite/test_areas/basic_correctness.yaml
+++ b/.buildkite/test_areas/basic_correctness.yaml
@ -0,0 +1,16 @@
+group: Basic Correctness
+depends_on: 
+  - image-build
+steps:
+- label: Basic Correctness
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_basic_correctness
+  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_cumem.py
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s basic_correctness/test_cumem.py
+  - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
--- a/.buildkite/test_areas/benchmarks.yaml
+++ b/.buildkite/test_areas/benchmarks.yaml
@ -0,0 +1,19 @@
+group: Benchmarks
+depends_on: 
+  - image-build
+steps:
+- label: Benchmarks
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/.buildkite"
+  source_file_dependencies:
+  - benchmarks/
+  commands:
+  - bash scripts/run-benchmarks.sh
+
+- label: Benchmarks CLI Test
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - vllm/
+  - tests/benchmarks/
+  commands:
+  - pytest -v -s benchmarks/
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@ -0,0 +1,57 @@
+group: Compile
+depends_on: 
+  - image-build
+steps:
+- label: Fusion and Compile Tests (B200)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
+  commands:
+    - nvidia-smi
+    - pytest -v -s tests/compile/test_fusion_attn.py
+    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # Wrap with quotes to escape yaml
+    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+
+- label: Fusion E2E (2 GPUs)(B200)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  commands:
+    - nvidia-smi
+    # Run all e2e fusion tests
+    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+
--- a/.buildkite/test_areas/cuda.yaml
+++ b/.buildkite/test_areas/cuda.yaml
@ -0,0 +1,22 @@
+group: CUDA
+depends_on: 
+  - image-build
+steps:
+- label: Platform Tests (CUDA)
+  timeout_in_minutes: 15
+  source_file_dependencies:
+  - vllm/
+  - tests/cuda
+  commands:
+    - pytest -v -s cuda/test_cuda_context.py
+
+- label: Cudagraph
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@ -0,0 +1,199 @@
+group: Distributed
+depends_on: 
+  - image-build
+steps:
+- label: Distributed Comm Ops
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/distributed
+  - tests/distributed
+  commands:
+  - pytest -v -s distributed/test_comm_ops.py
+  - pytest -v -s distributed/test_shm_broadcast.py
+  - pytest -v -s distributed/test_shm_buffer.py
+  - pytest -v -s distributed/test_shm_storage.py
+
+- label: Distributed (2 GPUs)
+  timeout_in_minutes: 90
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/compilation/
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/worker/worker_base.py
+  - vllm/v1/engine/
+  - vllm/v1/worker/
+  - tests/compile/fullgraph/test_basic_correctness.py
+  - tests/compile/test_wrapper.py
+  - tests/distributed/
+  - tests/entrypoints/llm/test_collective_rpc.py
+  - tests/v1/distributed
+  - tests/v1/entrypoints/openai/test_multi_api_servers.py
+  - tests/v1/shutdown
+  - tests/v1/worker/test_worker_memory_snapshot.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+  - pytest -v -s ./compile/test_wrapper.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - pytest -v -s distributed/test_sequence_parallel.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Tests (4 GPUs)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - tests/distributed/test_utils
+  - tests/distributed/test_pynccl
+  - tests/distributed/test_events
+  - tests/compile/fullgraph/test_basic_correctness.py
+  - examples/offline_inference/rlhf.py
+  - examples/offline_inference/rlhf_colocate.py
+  - tests/examples/offline_inference/data_parallel.py
+  - tests/v1/distributed
+  - tests/v1/engine/test_engine_core_client.py
+  - tests/distributed/test_symm_mem_allreduce.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and external_dp=2
+  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=2 and pp=2
+  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=4 and dp=1
+  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2, pp=2 and dp=1
+  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=1 and dp=4 with ep
+  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2 and dp=2 with ep
+  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with internal dp
+  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
+  - pytest -v -s distributed/test_utils.py
+  - pytest -v -s compile/fullgraph/test_basic_correctness.py
+  - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s distributed/test_events.py
+  - pytest -v -s distributed/test_symm_mem_allreduce.py
+  # TODO: create a dedicated test section for multi-GPU example tests
+  # when we have multiple distributed example tests
+  - cd ../examples/offline_inference
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+
+- label: Distributed Tests (8 GPUs)(H100)
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
+- label: Distributed Tests (4 GPUs)(A100)
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/
+  commands:
+  # NOTE: don't test llama model here, it seems hf implementation is buggy
+  # see https://github.com/vllm-project/vllm/pull/5689 for details
+  - pytest -v -s distributed/test_custom_all_reduce.py
+  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - pytest -v -s -x lora/test_mixtral.py
+
+- label: Distributed Tests (2 GPUs)(H200)
+  gpu: h200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+    - pytest -v -s tests/distributed/test_context_parallel.py
+    - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+    - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: Distributed Tests (2 GPUs)(B200)
+  gpu: b200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+    - pytest -v -s tests/distributed/test_context_parallel.py
+    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+    - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: 2 Node Test (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  num_nodes: 2
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  - tests/examples/offline_inference/data_parallel.py
+  commands:
+    - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
+
+- label: Distributed NixlConnector PD accuracy (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+
+- label: Pipeline + Context Parallelism (4 GPUs)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  commands:
+  - pytest -v -s distributed/test_pp_cudagraph.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@ -0,0 +1,42 @@
+group: E2E Integration
+depends_on: 
+  - image-build
+steps:
+- label: DeepSeek V2-Lite Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: Prime-RL Integration (2 GPUs)
+  timeout_in_minutes: 30
+  optional: true
+  soft_fail: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  source_file_dependencies:
+  - vllm/
+  - .buildkite/scripts/run-prime-rl-test.sh
+  commands:
+    - bash .buildkite/scripts/run-prime-rl-test.sh
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@ -0,0 +1,26 @@
+group: Engine
+depends_on: 
+  - image-build
+steps:
+- label: Engine
+  timeout_in_minutes: 15
+  source_file_dependencies:
+  - vllm/
+  - tests/engine
+  - tests/test_sequence
+  - tests/test_config
+  - tests/test_logger
+  - tests/test_vllm_port
+  commands:
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+
+- label: V1 e2e + engine
+  timeout_in_minutes: 45
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    # TODO: accuracy does not match on H100, whether or not
+    # VLLM_USE_FLASHINFER_SAMPLER is set.
+    - pytest -v -s v1/e2e
+    - pytest -v -s v1/engine
--- a/.buildkite/test_areas/entrypoints.yaml
+++ b/.buildkite/test_areas/entrypoints.yaml
@ -0,0 +1,83 @@
+group: Entrypoints
+depends_on: 
+  - image-build
+steps:
+- label: Entrypoints Unit Tests  
+  timeout_in_minutes: 10
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/entrypoints
+  - tests/entrypoints/
+  commands:
+  - pytest -v -s entrypoints/openai/tool_parsers
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration (LLM)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/llm
+  - tests/entrypoints/offline_mode
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Entrypoints Integration (API Server 1)
+  timeout_in_minutes: 130
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/openai
+  - tests/entrypoints/test_chat_utils
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/test_chat_utils.py
+
+
+- label: Entrypoints Integration (API Server 2)
+  timeout_in_minutes: 130
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/tool_use
+  - tests/entrypoints/sleep
+  - tests/entrypoints/instrumentator
+  - tests/entrypoints/rpc
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
+  - pytest -v -s entrypoints/instrumentator
+  - pytest -v -s entrypoints/sleep
+  - pytest -v -s tool_use
+
+- label: Entrypoints Integration (Pooling)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/pooling
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/pooling
+
+
+- label: Entrypoints V1
+  timeout_in_minutes: 50
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    - pytest -v -s v1/entrypoints
+
+- label: OpenAI API Correctness
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - csrc/
+  - vllm/entrypoints/openai/
+  - vllm/model_executor/models/whisper.py
+  commands: # LMEval+Transcription WER check
+  - pytest -s entrypoints/openai/correctness/
--- a/.buildkite/test_areas/expert_parallelism.yaml
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@ -0,0 +1,23 @@
+group: Expert Parallelism
+depends_on: 
+  - image-build
+steps:
+- label: EPLB Algorithm
+  timeout_in_minutes: 15
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_algo.py
+  commands:
+  - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_execute.py
+  commands:
+  - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@ -0,0 +1,117 @@
+group: Kernels
+depends_on: 
+  - image-build
+steps:
+- label: Kernels Core Operation Test
+  timeout_in_minutes: 75
+  source_file_dependencies:
+  - csrc/
+  - tests/kernels/core
+  - tests/kernels/test_top_k_per_row.py
+  commands:
+    - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+
+- label: Kernels Attention Test %N
+  timeout_in_minutes: 35
+  source_file_dependencies:
+  - csrc/attention/
+  - vllm/attention
+  - vllm/v1/attention
+  - tests/kernels/attention
+  commands:
+    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels Quantization Test %N
+  timeout_in_minutes: 90
+  source_file_dependencies:
+  - csrc/quantization/
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization
+  commands:
+    - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels MoE Test %N
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - csrc/quantization/cutlass_w8a8/moe/
+  - csrc/moe/
+  - tests/kernels/moe
+  - vllm/model_executor/layers/fused_moe/
+  - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
+  commands:
+    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels Mamba Test
+  timeout_in_minutes: 45
+  source_file_dependencies:
+  - csrc/mamba/
+  - tests/kernels/mamba
+  - vllm/model_executor/layers/mamba/ops
+  commands:
+    - pytest -v -s kernels/mamba
+
+- label: Kernels DeepGEMM Test (H100)
+  timeout_in_minutes: 45
+  gpu: h100
+  num_gpus: 1
+  source_file_dependencies:
+  - tools/install_deepgemm.sh
+  - vllm/utils/deep_gemm.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
+  commands:
+    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+    - pytest -v -s kernels/moe/test_deepgemm.py
+    - pytest -v -s kernels/moe/test_batched_deepgemm.py
+    - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
+- label: Kernels (B200)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  # optional: true
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - csrc/attention/mla/
+  - csrc/quantization/cutlass_w8a8/moe/
+  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/attention/backends/mla/cutlass_mla.py
+  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/platforms/cuda.py
+  - vllm/attention/selector.py
+  commands:
+    - nvidia-smi
+    - python3 examples/offline_inference/basic/chat.py
+    # Attention
+    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+    - pytest -v -s tests/kernels/attention/test_attention_selector.py
+    - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+    - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
+    - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+    - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
+    # Quantization
+    - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+    - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+    - pytest -v -s tests/kernels/moe/test_flashinfer.py
+    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
--- a/.buildkite/test_areas/lm_eval.yaml
+++ b/.buildkite/test_areas/lm_eval.yaml
@ -0,0 +1,46 @@
+group: LM Eval
+depends_on: 
+  - image-build
+steps:
+- label: LM Eval Small Models
+  timeout_in_minutes: 75
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  autorun_on_main: true
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
+
+- label: LM Eval Large Models (4 GPUs)(A100)
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+- label: LM Eval Large Models (4 GPUs)(H100)
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
+    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+- label: LM Eval Small Models (B200)
+  timeout_in_minutes: 120
+  gpu: b200
+  optional: true
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
--- a/.buildkite/test_areas/lora.yaml
+++ b/.buildkite/test_areas/lora.yaml
@ -0,0 +1,33 @@
+group: LoRA
+depends_on: 
+  - image-build
+steps:
+- label: LoRA %N
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  commands:
+    - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
+  parallelism: 4
+
+
+- label: LoRA TP (Distributed)
+  timeout_in_minutes: 30
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  commands:
+    # FIXME: find out which code initializes CUDA before running the test
+    # before the fix, we need to use spawn to test it
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    # A lot of these tests are on the edge of OOMing
+    - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+    # There is some Tensor Parallelism related processing logic in LoRA that
+    # requires multi-GPU testing for validation.
+    - pytest -v -s -x lora/test_chatglm3_tp.py
+    - pytest -v -s -x lora/test_llama_tp.py
+    - pytest -v -s -x lora/test_llm_with_multi_loras.py
+    - pytest -v -s -x lora/test_olmoe_tp.py
+    - pytest -v -s -x lora/test_gptoss_tp.py
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@ -0,0 +1,165 @@
+group: Miscellaneous
+depends_on: 
+  - image-build
+steps:
+- label: V1 Others
+  timeout_in_minutes: 60
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    # split the test to avoid interference
+    - pytest -v -s -m 'not cpu_test' v1/core
+    - pytest -v -s v1/executor
+    - pytest -v -s v1/kv_offload
+    - pytest -v -s v1/sample
+    - pytest -v -s v1/logits_processors
+    - pytest -v -s v1/worker
+    - pytest -v -s v1/spec_decode
+    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'not cpu_test' v1/metrics
+    - pytest -v -s v1/test_oracle.py
+    - pytest -v -s v1/test_request.py
+    - pytest -v -s v1/test_outputs.py
+    # Integration test for streaming correctness (requires special branch).
+    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
+    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+
+- label: V1 Others (CPU)
+  depends_on: ~
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  no_gpu: true
+  commands:
+    # split the test to avoid interference
+    - pytest -v -s -m 'cpu_test' v1/core
+    - pytest -v -s v1/structured_output
+    - pytest -v -s v1/test_serial_utils.py
+    - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'cpu_test' v1/metrics
+
+- label: Regression
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - vllm/
+  - tests/test_regression
+  commands:
+  - pip install modelscope
+  - pytest -v -s test_regression.py
+  working_dir: "/vllm-workspace/tests" # optional
+
+- label: Examples
+  timeout_in_minutes: 45
+  working_dir: "/vllm-workspace/examples"
+  source_file_dependencies:
+  - vllm/entrypoints
+  - vllm/multimodal
+  - examples/
+  commands:
+    - pip install tensorizer # for tensorizer test
+    - python3 offline_inference/basic/chat.py # for basic
+    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+    - python3 offline_inference/basic/classify.py
+    - python3 offline_inference/basic/embed.py
+    - python3 offline_inference/basic/score.py
+    # for multi-modal models
+    - python3 offline_inference/audio_language.py --seed 0
+    - python3 offline_inference/vision_language.py --seed 0
+    - python3 offline_inference/vision_language_multi_image.py --seed 0
+    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+    # for pooling models
+    - python3 pooling/pooling/vision_language_pooling.py --seed 0
+    # for features demo
+    - python3 offline_inference/prefix_caching.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+
+- label: Metrics, Tracing (2 GPUs)
+  timeout_in_minutes: 20
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/
+  - tests/v1/tracing
+  commands:
+  - "pip install \
+      'opentelemetry-sdk>=1.26.0' \
+      'opentelemetry-api>=1.26.0' \
+      'opentelemetry-exporter-otlp>=1.26.0' \
+      'opentelemetry-semantic-conventions-ai>=0.4.1'"
+  - pytest -v -s v1/tracing
+
+- label: Python-only Installation
+  depends_on: ~
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - tests/standalone_tests/python_only_compile.sh
+  - setup.py
+  commands:
+  - bash standalone_tests/python_only_compile.sh
+
+- label: Async Engine, Inputs, Utils, Worker
+  timeout_in_minutes: 50
+  source_file_dependencies:
+  - vllm/
+  - tests/multimodal
+  - tests/utils_
+  commands:
+  - pytest -v -s -m 'not cpu_test' multimodal
+  - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
+  depends_on: ~
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/
+  - tests/test_inputs.py
+  - tests/test_outputs.py
+  - tests/multimodal
+  - tests/standalone_tests/lazy_imports.py
+  - tests/tokenizers_
+  - tests/tool_parsers
+  - tests/transformers_utils
+  - tests/config
+  no_gpu: true
+  commands:
+  - python3 standalone_tests/lazy_imports.py
+  - pytest -v -s test_inputs.py
+  - pytest -v -s test_outputs.py
+  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s tokenizers_
+  - pytest -v -s tool_parsers
+  - pytest -v -s transformers_utils
+  - pytest -v -s config
+
+- label: GPT-OSS Eval (B200)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - uv pip install --system 'gpt-oss[eval]==0.0.5'
+    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Batch Invariance (H100)
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - vllm/model_executor/layers
+    - tests/v1/determinism/
+  commands:
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pip install pytest-timeout pytest-forked
+    - pytest -v -s v1/determinism/test_batch_invariance.py
+    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
--- a/.buildkite/test_areas/model_executor.yaml
+++ b/.buildkite/test_areas/model_executor.yaml
@ -0,0 +1,17 @@
+group: Model Executor
+depends_on: 
+  - image-build
+steps:
+- label: Model Executor
+  timeout_in_minutes: 35
+  source_file_dependencies:
+  - vllm/engine/arg_utils.py
+  - vllm/config/model.py
+  - vllm/model_executor
+  - tests/model_executor
+  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
+  commands:
+    - apt-get update && apt-get install -y curl libsodium23
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -v -s model_executor
+    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@ -0,0 +1,64 @@
+group: Models - Basic
+depends_on: 
+  - image-build
+steps:
+- label: Basic Models Tests (Initialization)
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_initialization.py
+  - tests/models/registry.py
+  commands:
+    # Run a subset of model initialization tests
+    - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
+
+- label: Basic Models Tests (Extra Initialization) %N
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/test_initialization.py
+  - tests/models/registry.py
+  commands:
+    # Only when vLLM model source is modified - test initialization of a large
+    # subset of supported models (the complement of the small subset in the above
+    # test.) Also run if model initialization test file is modified
+    - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Basic Models Tests (Other)
+  timeout_in_minutes: 45
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_transformers.py
+  - tests/models/test_registry.py
+  commands:
+    - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+  timeout_in_minutes: 10
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_utils.py
+  - tests/models/test_vision.py
+  no_gpu: true
+  commands:
+    - pytest -v -s models/test_utils.py models/test_vision.py
+
+- label: Transformers Nightly Models
+  working_dir: "/vllm-workspace/"
+  optional: true
+  soft_fail: true
+  commands:
+    - pip install --upgrade git+https://github.com/huggingface/transformers
+    - pytest -v -s tests/models/test_initialization.py
+    - pytest -v -s tests/models/test_transformers.py
+    - pytest -v -s tests/models/multimodal/processing/
+    - pytest -v -s tests/models/multimodal/test_mapping.py
+    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+    # Whisper needs spawn method to avoid deadlock
+    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
--- a/.buildkite/test_areas/models_distributed.yaml
+++ b/.buildkite/test_areas/models_distributed.yaml
@ -0,0 +1,22 @@
+group: Models - Distributed
+depends_on: 
+  - image-build
+steps:
+- label: Distributed Model Tests (2 GPUs)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/model_executor/model_loader/sharded_state_loader.py
+  - vllm/model_executor/models/
+  - tests/basic_correctness/
+  - tests/model_executor/model_loader/test_sharded_state_loader.py
+  - tests/models/
+  commands:
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
+  # Avoid importing model tests that cause CUDA reinitialization error
+  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
--- a/.buildkite/test_areas/models_language.yaml
+++ b/.buildkite/test_areas/models_language.yaml
@ -0,0 +1,91 @@
+group: Models - Language
+depends_on: 
+  - image-build
+steps:
+- label: Language Models Tests (Standard)
+  timeout_in_minutes: 25
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language
+  commands:
+    # Test standard language models, excluding a subset of slow tests
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/language -m 'core_model and (not slow_test)'
+
+- label: Language Models Tests (Extra Standard) %N
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/language/pooling/test_embedding.py
+  - tests/models/language/generation/test_common.py
+  - tests/models/language/pooling/test_classification.py
+  commands:
+    # Shard slow subset of standard language models tests. Only run when model
+    # source is modified, or when specified test files are modified
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation
+  commands:
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+    # Shard hybrid language model tests
+    - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Language Models Test (Extended Generation) # 80min
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation
+  commands:
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+
+- label: Language Models Test (PPL)
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation_ppl_test
+  commands:
+    - pytest -v -s models/language/generation_ppl_test
+
+- label: Language Models Test (Extended Pooling)  # 36min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/pooling
+  commands:
+    - pytest -v -s models/language/pooling -m 'not core_model'
+
+- label: Language Models Test (MTEB)
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/pooling_mteb_test
+  commands:
+    - pytest -v -s models/language/pooling_mteb_test
--- a/.buildkite/test_areas/models_multimodal.yaml
+++ b/.buildkite/test_areas/models_multimodal.yaml
@ -0,0 +1,79 @@
+group: Models - Multimodal
+depends_on: 
+  - image-build
+steps:
+- label: Multi-Modal Models (Standard) # 60min
+  timeout_in_minutes: 80
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Processor Test (CPU)
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  no_gpu: true
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Processor # 44min
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
+- label: Multi-Modal Models (Extended) 1
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+
+- label: Multi-Modal Models (Extended) 2
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+
+- label: Multi-Modal Models (Extended) 3
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+
+# This test is used only during the PR development phase to test individual models and should never run on main
+- label: Custom Models
+  optional: true
+  commands:
+    - echo 'Testing custom models...'
+    # PR authors can temporarily add commands below to test individual models
+    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
--- a/.buildkite/test_areas/plugins.yaml
+++ b/.buildkite/test_areas/plugins.yaml
@ -0,0 +1,34 @@
+group: Plugins
+depends_on: 
+  - image-build
+steps:
+- label: Plugin Tests (2 GPUs)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/plugins/
+  - tests/plugins/
+  commands:
+  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
+  - pip install -e ./plugins/vllm_add_dummy_platform
+  - pytest -v -s plugins_tests/test_platform_plugins.py
+  - pip uninstall vllm_add_dummy_platform -y
+  # end platform plugin tests
+  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+  - pip install -e ./plugins/prithvi_io_processor_plugin
+  - pytest -v -s plugins_tests/test_io_processor_plugins.py
+  - pip uninstall prithvi_io_processor_plugin -y
+  # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
+  # other tests continue here:
+  - pytest -v -s plugins_tests/test_scheduler_plugins.py
+  - pip install -e ./plugins/vllm_add_dummy_model
+  - pytest -v -s distributed/test_distributed_oot.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+  - pytest -v -s models/test_oot_registration.py # it needs a clean process
+  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
--- a/.buildkite/test_areas/pytorch.yaml
+++ b/.buildkite/test_areas/pytorch.yaml
@ -0,0 +1,52 @@
+group: PyTorch
+depends_on: 
+  - image-build
+steps:
+- label: PyTorch Compilation Unit Tests
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/
+    - tests/compile
+  commands:
+  # Run unit tests defined directly under compile/,
+  # not including subdirectories, which are usually heavier
+  # tests covered elsewhere.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  # However, find does not normally propagate error codes, so we combine it with xargs
+  # (using -0 for proper path handling)
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
+
+- label: PyTorch Fullgraph Smoke Test
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  # Run smoke tests under fullgraph directory, except test_full_graph.py
+  # as it is a heavy test that is covered in other steps.
+  # Use `find` to run one pytest process per file, avoiding https://github.com/vllm-project/vllm/issues/28965
+  # NOTE(review): `-exec ... \;` does not propagate pytest's exit code (the `-print0 | xargs -0` form does) - confirm intent
+  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph
+  timeout_in_minutes: 40
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+    # fp8 kv scales not supported on sm89, tested on Blackwell instead
+  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+    # Limit to no custom ops to reduce running time
+    # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+  # If this test fails, the nightly torch version is not compatible with some
+  # of the dependencies. Please check the error message and add the package to the whitelist
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
+  soft_fail: true
+  source_file_dependencies:
+  - requirements/nightly_torch_test.txt
+  commands:
+  - bash standalone_tests/pytorch_nightly_dependency.sh
--- a/.buildkite/test_areas/quantization.yaml
+++ b/.buildkite/test_areas/quantization.yaml
@ -0,0 +1,46 @@
+group: Quantization
+depends_on: 
+  - image-build
+steps:
+- label: Quantization
+  timeout_in_minutes: 90
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  - tests/quantization
+  commands:
+  # temporary install here since we need nightly, will move to requirements/test.in
+  # after torchao 0.12 release, and pin a working version of torchao nightly here
+
+  # Since torchao nightly is currently only compatible with torch nightly
+  # (https://github.com/pytorch/ao/issues/2919), we have to skip new torchao tests for now;
+  # we can only upgrade after this is resolved.
+  # TODO(jerryzh168): resolve the above comment
+  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - uv pip install --system conch-triton-kernels
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
+
+- label: Quantized MoE Test (B200)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - tests/quantization/test_blackwell_moe.py
+  - vllm/model_executor/models/deepseek_v2.py
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/models/llama4.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization/compressed_tensors
+  - vllm/model_executor/layers/quantization/modelopt.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Quantized Models Test
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/model_executor/layers/quantization
+  - tests/models/quantization
+  commands:
+    - pytest -v -s models/quantization
--- a/.buildkite/test_areas/samplers.yaml
+++ b/.buildkite/test_areas/samplers.yaml
@ -0,0 +1,14 @@
+group: Samplers
+depends_on: 
+  - image-build
+steps:
+- label: Samplers Test
+  timeout_in_minutes: 75
+  source_file_dependencies:
+  - vllm/model_executor/layers
+  - vllm/sampling_metadata.py
+  - tests/samplers
+  - tests/conftest.py
+  commands:
+    - pytest -v -s samplers
+    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
--- a/.buildkite/test_areas/weight_loading.yaml
+++ b/.buildkite/test_areas/weight_loading.yaml
@ -0,0 +1,25 @@
+group: Weight Loading
+depends_on: 
+  - image-build
+steps:
+- label: Weight Loading Multiple GPU  # 33min
+  timeout_in_minutes: 45
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU - Large Models # optional
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  gpu: a100
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -9,11 +9,13 @@
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
+/vllm/model_executor/layers/batch_invariant.py @yewentao256 
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
 /vllm/entrypoints @aarnphm @chaunceyjiang
+/vllm/tool_parsers @aarnphm @chaunceyjiang
 /vllm/compilation @zou3519 @youkaichao @ProExpertProg
 /vllm/distributed/kv_transfer @NickLucche @ApostaC
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson
@ -35,6 +37,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/offloading @ApostaC

+# Model runner V2
+/vllm/v1/worker/gpu @WoosukKwon
+
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin 
 /tests/distributed/test_multi_node_assignment.py @youkaichao
@ -56,6 +61,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/v1/kv_connector/nixl_integration @NickLucche
 /tests/v1/kv_connector @ApostaC
 /tests/v1/offloading @ApostaC
+/tests/v1/determinism @yewentao256 

 # Transformers modeling backend
 /vllm/model_executor/models/transformers @hmellor
@ -141,9 +147,10 @@ mkdocs.yaml @hmellor
 /requirements/kv_connectors.txt @NickLucche

 # Pooling models
-/examples/*/pooling/ @noooop
+/examples/pooling @noooop
 /tests/models/*/pooling* @noooop
 /tests/entrypoints/pooling @noooop
+/vllm/entrypoints/pooling @noooop
 /vllm/config/pooler.py @noooop
 /vllm/pooling_params.py @noooop
 /vllm/model_executor/layers/pooler.py @noooop
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -14,6 +14,52 @@ pull_request_rules:
    comment:
      message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"

+- name: comment-pre-commit-failure
+  description: Comment on PR when pre-commit check fails
+  conditions:
+    - status-failure=pre-commit
+    - -closed
+    - -draft
+  actions:
+    comment:
+      message: |
+        Hi @{{author}}, the pre-commit checks have failed. Please run:
+
+        ```bash 
+        uv pip install pre-commit
+        pre-commit install
+        pre-commit run --all-files
+        ```
+
+        Then, commit the changes and push to your branch.
+
+        For future commits, `pre-commit` will run automatically on changed files before each commit.
+
+        > [!TIP]
+        > <details>
+        > <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
+        > <br/>
+        > <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
+        >
+        > ```bash
+        > # For mypy (substitute "3.10" with the failing version if needed)
+        > pre-commit run --hook-stage manual mypy-3.10
+        > # For markdownlint
+        > pre-commit run --hook-stage manual markdownlint
+        > ```
+        > </details>
+
+- name: comment-dco-failure
+  description: Comment on PR when DCO check fails
+  conditions:
+    - status-failure=dco
+    - -closed
+    - -draft
+  actions:
+    comment:
+      message: |
+        Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this.
+
 - name: label-ci-build
  description: Automatically apply ci/build label
  conditions:
@ -140,7 +186,7 @@ pull_request_rules:
      - files~=^tests/entrypoints/test_context.py
      - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
      - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
-      - files~=^vllm/entrypoints/harmony_utils.py
+      - files~=^vllm/entrypoints/openai/parser/harmony_utils.py
      - files~=^vllm/entrypoints/tool_server.py
      - files~=^vllm/entrypoints/tool.py
      - files~=^vllm/entrypoints/context.py
@ -189,6 +235,20 @@ pull_request_rules:
      add:
        - rocm

+- name: label-cpu
+  description: Automatically apply cpu label
+  conditions:
+    - label != stale
+    - files~=^(?!.*kv_offload)(?!.*cpu_offload).*\bcpu.*
+  actions:
+    label:
+      add:
+        - cpu
+    assign:
+      users:
+        - "fadara01"
+        - "aditew01"
+
 - name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
@ -289,6 +349,18 @@ pull_request_rules:
      add:
        - tool-calling

+- name: auto-rebase if approved, ready, and 40 commits behind main
+  conditions:
+    - base = main
+    - label=ready
+    - "#approved-reviews-by >= 1"
+    - "#commits-behind >= 40"
+    - -closed
+    - -draft
+    - -conflict
+  actions:
+    rebase: {}
+
 - name: ping author on conflicts and add 'needs-rebase' label
  conditions:
    - label != stale
@ -358,4 +430,4 @@ pull_request_rules:
  actions:
    label:
      add:
-        - kv-connector
+        - kv-connector
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@ -13,10 +13,10 @@ jobs:

    steps:
      - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1

      - name: Set up Python
-        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
        with:
          python-version: '3.12'

--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@ -105,6 +105,31 @@ jobs:
                  }
                ],
              },
+              cpu: {
+                // Keyword search - matches whole words only (with word boundaries)
+                keywords: [
+                  {
+                    term: "CPU Backend",
+                    searchIn: "title"
+                  },
+                  {
+                    term: "x86",
+                    searchIn: "title"
+                  },
+                  {
+                    term: "ARM",
+                    searchIn: "title"
+                  },
+                  {
+                    term: "Apple Silicon",
+                    searchIn: "title"
+                  },
+                  {
+                    term: "IBM Z",
+                    searchIn: "title"
+                  },
+                ],
+              },
              // Add more label configurations here as needed
              // example: {
              //   keywords: [...],
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@ -12,7 +12,7 @@ jobs:
    timeout-minutes: 30

    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6.0.1

      - uses: astral-sh/setup-uv@v7
        with:
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -16,8 +16,8 @@ jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+    - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+    - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
      with:
        python-version: "3.12"
    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -7,13 +7,15 @@ on:

 jobs:
  close-issues-and-pull-requests:
+    # Prevents triggering on forks or other repos
+    if: github.repository == 'vllm-project/vllm'
    permissions:
      issues: write
      pull-requests: write
      actions: write
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+      - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -56,8 +56,8 @@ endif()
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")

 #
 # Try to find python package with an executable that exactly matches
@ -136,7 +136,7 @@ elseif(HIP_FOUND)

  # ROCm 5.X and 6.X
  if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
-      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
+      Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
    message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
      "expected for ROCm build, saw ${Torch_VERSION} instead.")
  endif()
@ -307,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v4.2.1")

  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -354,9 +354,22 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Only build Marlin kernels if we are building for at least some compatible archs.
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
  # are not supported by Machete yet.
-  # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
-  if (MARLIN_ARCHS)
+
+  # marlin arches for fp16 output
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  # marlin has limited support for turing
+  cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
+  # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
+  cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+  # marlin arches for fp8 input
+  # - sm80 doesn't support fp8 computation
+  # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SASS instruction
+  # so we only enable fp8 computation for SM 8.9 (e.g. RTX 40x0) and SM 12.0 (e.g. RTX 50x0)
+  cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+  # marlin arches for other files
+  cuda_archs_loose_intersection(MARLIN_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
+
+  if (MARLIN_OTHER_ARCHS)

    #
    # For the Marlin kernels we automatically generate sources for various
@ -365,16 +378,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(MARLIN_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
    file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
+    list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+    set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")

-    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
-    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
+    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")

-    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
-        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
+    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
-        PYTHONPATH=$PYTHONPATH
-          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
+        PYTHONPATH=$ENV{PYTHONPATH}
+          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
        RESULT_VARIABLE marlin_generation_result
        OUTPUT_VARIABLE marlin_generation_result
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
@ -387,40 +402,76 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                            "\nCheck the log for details: "
                            "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
      else()
-        set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
-            CACHE STRING "Last run Marlin generate script hash" FORCE)
+        set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+            CACHE STRING "Last run Marlin generate script hash and arch" FORCE)
        message(STATUS "Marlin generation completed successfully.")
      endif()
    else()
      message(STATUS "Marlin generation script has not changed, skipping generation.")
    endif()

-    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
-      CUDA_ARCHS "${MARLIN_ARCHS}")
-    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
-        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    if (MARLIN_ARCHS)
+      file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
+        CUDA_ARCHS "${MARLIN_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
+
+      file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
+        CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
    endif()

-    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
+    if (MARLIN_SM75_ARCHS) 
+      file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
+        CUDA_ARCHS "${MARLIN_SM75_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_TEMPLATE_SM75_KERNEL_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
+    endif()
+
+    if (MARLIN_FP8_ARCHS) 
+      file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
+        CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC})
+    endif()

    set(MARLIN_SRCS
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
       "csrc/quantization/gptq_marlin/gptq_marlin.cu"
+       "csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
       "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
       "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_SRCS}"
-      CUDA_ARCHS "${MARLIN_ARCHS}")
+      CUDA_ARCHS "${MARLIN_OTHER_ARCHS}")
    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
+      set_source_files_properties(${MARLIN_SRCS}
        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
    endif()
    list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")

-    message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
+    message(STATUS "Building Marlin kernels for archs: ${MARLIN_OTHER_ARCHS}")
  else()
    message(STATUS "Not building Marlin kernels as no compatible archs found"
                   " in CUDA target architectures")
@ -604,12 +655,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
-      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
+      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
+      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
+      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${FP4_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
  else()
    message(STATUS "Not building NVFP4 as no compatible archs were found.")
@ -745,24 +799,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  else()
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
  endif()
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
-    message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
-  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-      message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
-                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
-                     "if you intend on running FP8 quantized MoE models on Blackwell.")
-    else()
-      message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
-                     "in CUDA target architectures")
-    endif()
-  endif()

  #
  # Machete kernels
@ -786,7 +822,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
        OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
-        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH}
          ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
        RESULT_VARIABLE machete_generation_result
        OUTPUT_VARIABLE machete_generation_output
@ -838,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
    set(SRCS
-       "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
+       "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
+       "csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
+       "csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
+       )

    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -908,7 +947,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
  "csrc/moe/moe_align_sum_kernels.cu"
-  "csrc/moe/moe_lora_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
@ -938,9 +976,20 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    CUDA_ARCHS "${CUDA_ARCHS}")

  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
-  # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
-  if (MARLIN_MOE_ARCHS)
+  # moe marlin arches
+  # note that we always set `use_atomic_add=False` for moe marlin now,
+  # so we don't need 9.0 for bf16 atomicAdd PTX
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  # moe marlin has limited support for turing
+  cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
+  # moe marlin arches for fp8 input
+  # - sm80 doesn't support fp8 computation
+  # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SASS instruction
+  # so we only enable fp8 computation for SM 8.9 (e.g. RTX 40x0) and SM 12.0 (e.g. RTX 50x0)
+  cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+  # moe marlin arches for other files
+  cuda_archs_loose_intersection(MARLIN_MOE_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
+  if (MARLIN_MOE_OTHER_ARCHS)

    #
    # For the Marlin MOE kernels we automatically generate sources for various
@ -949,16 +998,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(MOE_MARLIN_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
    file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
+    list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+    set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")

-    message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
-    message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
+    message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+    message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")

-    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
-        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
+    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
-        PYTHONPATH=$PYTHONPATH
-          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
+        PYTHONPATH=$ENV{PYTHONPATH}
+          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
        RESULT_VARIABLE moe_marlin_generation_result
        OUTPUT_VARIABLE moe_marlin_generation_output
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
@ -971,7 +1022,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                            "\nCheck the log for details: "
                            "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
      else()
-        set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
+        set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
            CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
        message(STATUS "Marlin MOE generation completed successfully.")
      endif()
@ -979,18 +1030,53 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
    endif()

-    file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${MOE_WNAA16_MARLIN_SRC}"
-      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
-    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
-        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    if (MARLIN_MOE_ARCHS)
+      file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_MOE_SRC}"
+        CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_MOE_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
    endif()

-    list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
+    if (MARLIN_MOE_SM75_ARCHS) 
+      file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_MOE_SM75_SRC}"
+        CUDA_ARCHS "${MARLIN_MOE_SM75_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_MOE_SM75_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SM75_SRC})
+    endif()

-    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
+    if (MARLIN_MOE_FP8_ARCHS)
+      file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_MOE_FP8_SRC}"
+        CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_MOE_FP8_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
+    endif()
+
+    set(MARLIN_MOE_OTHER_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${MARLIN_MOE_OTHER_SRC}"
+      CUDA_ARCHS "${MARLIN_MOE_OTHER_ARCHS}")
+    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+      set_source_files_properties(${MARLIN_MOE_OTHER_SRC}
+        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    endif()
+    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_OTHER_SRC}")
+
+    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_OTHER_ARCHS}")
  else()
    message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
                   " in CUDA target architectures")
--- a/README.md
+++ b/README.md
@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio

 *Latest News* 🔥

+- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
 - [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
 - [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
 - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
@ -136,16 +137,19 @@ Compute Resources:
 - Alibaba Cloud
 - AMD
 - Anyscale
+- Arm
 - AWS
 - Crusoe Cloud
 - Databricks
 - DeepInfra
 - Google Cloud
+- IBM
 - Intel
 - Lambda Lab
 - Nebius
 - Novita AI
 - NVIDIA
+- Red Hat
 - Replicate
 - Roblox
 - RunPod
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@ -83,7 +83,7 @@ MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
 ```

-#### 2. Maximize Throughput with a Latency Requirement
+### 2. Maximize Throughput with a Latency Requirement

 - **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
 - **Configuration**:
@ -96,7 +96,7 @@ MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=500
 ```

-#### 3. Maximize Throughput with Prefix Caching and Latency Requirements
+### 3. Maximize Throughput with Prefix Caching and Latency Requirements

 - **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
 - **Configuration**:
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@ -18,6 +18,11 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
 MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
 NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
 NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
+HOSTNAME=$(hostname)
+if [[ -z "$HOSTNAME" ]]; then
+    echo "Error: Failed to determine hostname." >&2
+    exit 1
+fi

 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
@ -82,6 +87,7 @@ start_server() {
        "$MODEL"
        "--disable-log-requests"
        "--port" "8004"
+        "--host" "$HOSTNAME"
        "--gpu-memory-utilization" "$gpu_memory_utilization"
        "--max-num-seqs" "$max_num_seqs"
        "--max-num-batched-tokens" "$max_num_batched_tokens"
@ -96,8 +102,9 @@ start_server() {
    # This correctly passes each element as a separate argument.
    if [[ -n "$profile_dir" ]]; then
        # Start server with profiling enabled
-        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
-            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
+        local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
+        VLLM_SERVER_DEV_MODE=1 \
+            vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
    else
        # Start server without profiling
        VLLM_SERVER_DEV_MODE=1 \
@ -112,7 +119,7 @@ start_server() {
        # since that we should always have permission to send signal to the server process.
        kill -0 $server_pid 2> /dev/null || break

-        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
+        RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
        if [[ "$STATUS_CODE" -eq 200 ]]; then
            server_started=1
@ -172,6 +179,7 @@ run_benchmark() {
        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
        --num-prompts 1000 \
        --random-prefix-len $prefix_len \
+        --host "$HOSTNAME" \
        --port 8004 &> "$bm_log"
    throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
    e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@ -187,7 +195,7 @@ run_benchmark() {
        request_rate=$((${throughput%.*} + 1))
        while ((request_rate > 0)); do
            # clear prefix cache
-            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
+            curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
            sleep 5
            bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
            vllm bench serve \
@ -203,6 +211,7 @@ run_benchmark() {
                --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
                --num-prompts 100 \
                --random-prefix-len $prefix_len \
+                --host "$HOSTNAME" \
                --port 8004 &> "$bm_log"
            throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@ -303,6 +312,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
        --num-prompts 100 \
        --random-prefix-len $prefix_len \
+        --host "$HOSTNAME" \
        --port 8004 \
        --profile &> "$bm_log"
 else
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -620,7 +620,7 @@ def get_tokenizer(
        kwargs["use_fast"] = False
    if tokenizer_mode == "mistral":
        try:
-            from vllm.transformers_utils.tokenizer import MistralTokenizer
+            from vllm.tokenizers.mistral import MistralTokenizer
        except ImportError as e:
            raise ImportError(
                "MistralTokenizer requires vllm package.\n"
--- a/benchmarks/benchmark_batch_invariance.py
+++ b/benchmarks/benchmark_batch_invariance.py
@ -104,7 +104,6 @@ def run_benchmark_with_batch_invariant(
    random.seed(seed)

    # Set environment variables
-    os.environ["VLLM_ATTENTION_BACKEND"] = backend
    if batch_invariant:
        os.environ["VLLM_BATCH_INVARIANT"] = "1"
    else:
@ -140,6 +139,7 @@ def run_benchmark_with_batch_invariant(
            max_model_len=max_model_len,
            dtype="bfloat16",
            tensor_parallel_size=tp_size,
+            attention_config={"backend": backend},
            enable_prefix_caching=False,
        )
        init_time = time.perf_counter() - start_init
--- a/benchmarks/benchmark_hash.py
+++ b/benchmarks/benchmark_hash.py
@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
+
+This focuses on a single test payload shaped like the prefix-cache hash input:
+    (32-byte bytes object, 32-int tuple)
+
+Usage:
+    python benchmarks/benchmark_hash.py --iterations 20000
+"""
+
+from __future__ import annotations
+
+import argparse
+import random
+import statistics
+import time
+from collections.abc import Callable, Iterable
+
+from vllm.utils.hashing import sha256, xxhash
+
+
+def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
+    """Generate a deterministic payload without reseeding the global RNG."""
+    rng = random.Random(seed)
+    bytes_data = bytes(rng.getrandbits(8) for _ in range(32))
+    int_tuple = tuple(rng.randint(1, 1_000_000) for _ in range(32))
+    return (bytes_data, int_tuple)
+
+
+def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
+    """Time `iterations` calls of `func(data)`; return (mean_s, stdev_s)."""
+    samples: list[float] = []
+
+    # 200 untimed calls first, so first-run noise stays out of the samples.
+    for _ in range(200):
+        func(data)
+
+    for _ in range(iterations):
+        began = time.perf_counter()
+        func(data)
+        finished = time.perf_counter()
+        samples.append(finished - began)
+
+    mean_s = statistics.mean(samples)
+    # stdev() needs two samples; a single measurement reports zero spread.
+    return mean_s, (statistics.stdev(samples) if len(samples) > 1 else 0.0)
+
+
+def _run_benchmarks(
+    benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
+    data: tuple,
+    iterations: int,
+):
+    """Lazily time every (name, func) pair and yield (name, avg, std) tuples.
+
+    A pair whose timing run raises ModuleNotFoundError is reported on stdout
+    and left out of the results.
+    """
+    for label, hasher in benchmarks:
+        try:
+            mean_s, std_s = _benchmark_func(hasher, data, iterations)
+        except ModuleNotFoundError as missing:
+            print(f"Skipping {label}: {missing}")
+        else:
+            yield label, mean_s, std_s
+
+
+def builtin_hash(data: tuple) -> int:
+    """Return ``hash(data)``; a named wrapper around the built-in ``hash()``."""
+    return hash(data)
+
+
+def main() -> None:
+    """Parse CLI options, time each hash function, and print the comparison.
+
+    Exits through ``parser.error`` when ``--iterations`` is smaller than 1,
+    because the timing statistics need at least one measured sample.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--iterations",
+        type=int,
+        default=10_000,
+        help="Number of measured iterations per hash function.",
+    )
+    parser.add_argument(
+        "--seed", type=int, default=42, help="Random seed for test payload."
+    )
+    args = parser.parse_args()
+    if args.iterations < 1:
+        # With no samples statistics.mean() raises StatisticsError; report a
+        # usage error instead of a traceback.
+        parser.error("--iterations must be at least 1")
+
+    data = _generate_test_data(args.seed)
+    benchmarks = (
+        ("SHA256 (pickle)", sha256),
+        ("xxHash (pickle)", xxhash),
+        ("built-in hash()", builtin_hash),
+    )
+
+    print("=" * 60)
+    print("HASH FUNCTION MICRO BENCHMARK")
+    print("=" * 60)
+    print("Test data: (32-byte bytes object, 32-int tuple)")
+    print(f"Iterations: {args.iterations:,}")
+    print("=" * 60)
+
+    results = list(_run_benchmarks(benchmarks, data, args.iterations))
+    # The built-in hash() row is the baseline for the relative summary below.
+    builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)
+
+    print("\nResults:")
+    for name, avg, std in results:
+        print(f"  {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")
+
+    if builtin_entry:
+        _, builtin_avg, _ = builtin_entry
+        print("\n" + "=" * 60)
+        print("SUMMARY (relative to built-in hash())")
+        print("=" * 60)
+        for name, avg, _ in results:
+            if name == "built-in hash()":
+                continue
+            speed_ratio = avg / builtin_avg
+            print(f"• {name} is {speed_ratio:.1f}x slower than built-in hash()")
+    else:
+        print("\nBuilt-in hash() result missing; cannot compute speed ratios.")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@ -32,12 +32,11 @@ def benchmark_propose(args):

        model_config = ModelConfig(
            model="facebook/opt-125m",
-            task="generate",
            max_model_len=args.num_token + args.num_spec_token,
            tokenizer="facebook/opt-125m",
            tokenizer_mode="auto",
            dtype="auto",
-            seed=None,
+            seed=0,
            trust_remote_code=False,
        )
        proposer = NgramProposer(
@ -108,7 +107,10 @@ def benchmark_batched_propose(args):
        device_config=DeviceConfig(device=current_platform.device_type),
        parallel_config=ParallelConfig(),
        load_config=LoadConfig(),
-        scheduler_config=SchedulerConfig(),
+        scheduler_config=SchedulerConfig(
+            max_model_len=model_config.max_model_len,
+            is_encoder_decoder=model_config.is_encoder_decoder,
+        ),
    )

    # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
--- a/benchmarks/benchmark_prefix_block_hash.py
+++ b/benchmarks/benchmark_prefix_block_hash.py
@ -0,0 +1,110 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Simple benchmark to compare prefix-cache block hashing algorithms.
+
+Example:
+    python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32
+"""
+
+from __future__ import annotations
+
+import argparse
+import random
+import statistics
+import sys
+import time
+from collections.abc import Callable, Iterable, Sequence
+
+from vllm.utils.hashing import get_hash_fn_by_name
+from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash
+
+SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor")
+
+
+def _generate_blocks(
+    num_blocks: int, block_size: int, vocab_size: int, seed: int
+) -> list[list[int]]:
+    """Build ``num_blocks`` lists of ``block_size`` pseudo-random token ids.
+
+    Ids are drawn from ``[0, vocab_size)`` using a private ``random.Random``
+    seeded with ``seed``, so equal arguments always give equal blocks.
+    """
+    token_source = random.Random(seed)
+    generated: list[list[int]] = []
+    for _ in range(num_blocks):
+        row = [token_source.randrange(vocab_size) for _ in range(block_size)]
+        generated.append(row)
+    return generated
+
+
+def _hash_all_blocks(
+    hash_fn: Callable[[object], bytes],
+    blocks: Iterable[Sequence[int]],
+) -> float:
+    """Hash ``blocks`` in order as one chain and return the elapsed seconds.
+
+    Each call to ``hash_block_tokens`` receives the previous call's result as
+    its parent hash; the first block is hashed with a parent of ``None``.
+    """
+    parent_hash: BlockHash | None = None
+    start = time.perf_counter()
+    for block in blocks:
+        parent_hash = hash_block_tokens(hash_fn, parent_hash, block, extra_keys=None)
+    end = time.perf_counter()
+    return end - start
+
+
+def _benchmark(
+    hash_algo: str,
+    blocks: list[list[int]],
+    trials: int,
+) -> tuple[float, float, float] | None:
+    """Time ``trials`` full passes over ``blocks`` with one hash algorithm.
+
+    Returns ``(avg_seconds, best_seconds, tokens_per_second)``, where the
+    throughput is derived from the fastest pass, or ``None`` when resolving or
+    running the algorithm raises ``ModuleNotFoundError``.
+
+    Raises:
+        ValueError: if ``trials`` produced no timed passes (``trials < 1``).
+    """
+    try:
+        hash_fn = get_hash_fn_by_name(hash_algo)
+        init_none_hash(hash_fn)
+        timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)]
+    except ModuleNotFoundError as exc:
+        print(f"Skipping {hash_algo}: {exc}", file=sys.stderr)
+        return None
+
+    if not timings:
+        # mean()/min() below cannot work on an empty list of timings.
+        raise ValueError(f"trials must be at least 1, got {trials}")
+
+    avg = statistics.mean(timings)
+    best = min(timings)
+    # throughput: tokens / second
+    # Sum the per-block lengths instead of reading blocks[0], so an empty
+    # block list counts zero tokens rather than raising IndexError.
+    tokens_hashed = sum(len(block) for block in blocks)
+    # Guard the division in case a pass measured an elapsed time of zero.
+    throughput = tokens_hashed / best if best > 0 else 0.0
+    return avg, best, throughput
+
+
+def main() -> None:
+    """Parse CLI options, generate the token blocks, and report each algorithm.
+
+    One result line is printed per algorithm; an algorithm for which
+    ``_benchmark`` returns ``None`` produces no result line.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--num-blocks", type=int, default=10000, help="Block count.")
+    parser.add_argument("--block-size", type=int, default=32, help="Tokens per block.")
+    parser.add_argument(
+        "--vocab-size", type=int, default=32000, help="Token id range [0, vocab_size)."
+    )
+    parser.add_argument("--seed", type=int, default=0, help="Random seed.")
+    parser.add_argument(
+        "--trials", type=int, default=5, help="Number of timed trials per algorithm."
+    )
+    parser.add_argument(
+        "--algorithms",
+        nargs="+",
+        default=SUPPORTED_ALGOS,
+        choices=SUPPORTED_ALGOS,
+        help="Hash algorithms to benchmark.",
+    )
+    args = parser.parse_args()
+
+    # The same generated blocks are reused for every algorithm.
+    blocks = _generate_blocks(
+        args.num_blocks, args.block_size, args.vocab_size, args.seed
+    )
+    print(
+        f"Benchmarking {len(args.algorithms)} algorithms on "
+        f"{args.num_blocks} blocks (block size={args.block_size})."
+    )
+
+    for algo in args.algorithms:
+        result = _benchmark(algo, blocks, args.trials)
+        if result is None:
+            continue
+
+        avg, best, throughput = result
+        # Throughput arrives in tokens/s and is printed in millions of tokens/s.
+        print(
+            f"{algo:14s} avg: {avg:.6f}s  best: {best:.6f}s  "
+            f"throughput: {throughput / 1e6:.2f}M tokens/s"
+        )
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs
 from vllm.utils.argparse_utils import FlexibleArgumentParser

 try:
-    from vllm.transformers_utils.tokenizer import get_tokenizer
+    from vllm.tokenizers import get_tokenizer
 except ImportError:
    from backend_request_func import get_tokenizer

--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase

 try:
-    from vllm.transformers_utils.tokenizer import get_tokenizer
+    from vllm.tokenizers import get_tokenizer
 except ImportError:
    from backend_request_func import get_tokenizer

@ -574,7 +574,7 @@ async def benchmark(
    )
    print(
        "{:<40} {:<10.2f}".format(
-            "Total Token throughput (tok/s):", metrics.total_token_throughput
+            "Total token throughput (tok/s):", metrics.total_token_throughput
        )
    )

@ -963,8 +963,7 @@ def create_argument_parser():
    parser.add_argument(
        "--profile",
        action="store_true",
-        help="Use Torch Profiler. The endpoint must be launched with "
-        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+        help="Use vLLM Profiling. --profiler-config must be provided on the server.",
    )
    parser.add_argument(
        "--result-dir",
--- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
+++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
@ -5,11 +5,12 @@ import argparse
 import asyncio
 import logging
 import os
+import time
+import uuid
+from urllib.parse import urlparse

 import aiohttp
 from quart import Quart, Response, make_response, request
-from rate_limiter import RateLimiter
-from request_queue import RequestQueue

 # Configure logging
 logging.basicConfig(level=logging.INFO)
@ -24,26 +25,8 @@ def parse_args():
    parser.add_argument(
        "--timeout",
        type=float,
-        default=300,
-        help="Timeout for backend service requests in seconds (default: 300)",
-    )
-    parser.add_argument(
-        "--max-concurrent",
-        type=int,
-        default=100,
-        help="Maximum concurrent requests to backend services (default: 100)",
-    )
-    parser.add_argument(
-        "--queue-size",
-        type=int,
-        default=500,
-        help="Maximum number of requests in the queue (default: 500)",
-    )
-    parser.add_argument(
-        "--rate-limit",
-        type=int,
-        default=40,
-        help="Maximum requests per second (default: 40)",
+        default=6 * 60 * 60,
+        help="Timeout for backend service requests in seconds (default: 21600)",
    )
    parser.add_argument(
        "--port",
@ -54,14 +37,32 @@ def parse_args():
    parser.add_argument(
        "--prefill-url",
        type=str,
-        default="http://localhost:8100/v1/completions",
-        help="Prefill service endpoint URL",
+        default="http://localhost:8100",
+        help="Prefill service base URL (protocol + host[:port])",
    )
    parser.add_argument(
        "--decode-url",
        type=str,
-        default="http://localhost:8200/v1/completions",
-        help="Decode service endpoint URL",
+        default="http://localhost:8200",
+        help="Decode service base URL (protocol + host[:port])",
+    )
+    parser.add_argument(
+        "--kv-host",
+        type=str,
+        default="localhost",
+        help="Hostname or IP used by KV transfer (default: localhost)",
+    )
+    parser.add_argument(
+        "--prefill-kv-port",
+        type=int,
+        default=14579,
+        help="Prefill KV port (default: 14579)",
+    )
+    parser.add_argument(
+        "--decode-kv-port",
+        type=int,
+        default=14580,
+        help="Decode KV port (default: 14580)",
    )

    return parser.parse_args()
@ -73,70 +74,129 @@ def main():

    # Initialize configuration using command line parameters
    AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout)
-    MAX_CONCURRENT_REQUESTS = args.max_concurrent
-    REQUEST_QUEUE_SIZE = args.queue_size
-    RATE_LIMIT = args.rate_limit
    PREFILL_SERVICE_URL = args.prefill_url
    DECODE_SERVICE_URL = args.decode_url
    PORT = args.port

+    PREFILL_KV_ADDR = f"{args.kv_host}:{args.prefill_kv_port}"
+    DECODE_KV_ADDR = f"{args.kv_host}:{args.decode_kv_port}"
+
+    logger.info(
+        "Proxy resolved KV addresses -> prefill: %s, decode: %s",
+        PREFILL_KV_ADDR,
+        DECODE_KV_ADDR,
+    )
+
    app = Quart(__name__)

-    # Initialize the rate limiter and request queue
-    rate_limiter = RateLimiter(RATE_LIMIT)
-    request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE)
-
-    # Attach the configuration object to the application instance
+    # Attach the configuration object to the application instance so helper
+    # coroutines can read the resolved backend URLs and timeouts without using
+    # globals.
    app.config.update(
        {
            "AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT,
-            "rate_limiter": rate_limiter,
-            "request_queue": request_queue,
            "PREFILL_SERVICE_URL": PREFILL_SERVICE_URL,
            "DECODE_SERVICE_URL": DECODE_SERVICE_URL,
+            "PREFILL_KV_ADDR": PREFILL_KV_ADDR,
+            "DECODE_KV_ADDR": DECODE_KV_ADDR,
        }
    )

-    # Start queue processing on app startup
-    @app.before_serving
-    async def startup():
-        """Start request processing task when app starts serving"""
-        asyncio.create_task(request_queue.process())
+    def _normalize_base_url(url: str) -> str:
+        """Drop all trailing "/" from ``url`` so paths can be appended directly."""
+        return url.rstrip("/")

-    async def forward_request(url, data):
-        """Forward request to backend service with rate limiting and error handling"""
-        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+    def _get_host_port(url: str) -> str:
+        """Format ``url`` as "host:port", filling in defaults for missing parts.
+
+        A missing hostname becomes "localhost"; a missing port becomes 80 for
+        the "http" scheme and 443 for any other scheme.
+        """
+        pieces = urlparse(url)
+        hostname = pieces.hostname if pieces.hostname else "localhost"
+        explicit_port = pieces.port
+        if explicit_port is not None:
+            return f"{hostname}:{explicit_port}"
+        fallback_port = 80 if pieces.scheme == "http" else 443
+        return f"{hostname}:{fallback_port}"

-        # Use rate limiter as context manager
-        async with (
-            rate_limiter,
-            aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
-        ):
-            try:
-                async with session.post(
-                    url=url, json=data, headers=headers
-                ) as response:
-                    if response.status == 200:
-                        # Stream response chunks
-                        async for chunk_bytes in response.content.iter_chunked(1024):
-                            yield chunk_bytes
-                    else:
-                        # Handle backend service errors
-                        error_text = await response.text()
-                        logger.error(
-                            "Backend service error: %s - %s",
-                            response.status,
-                            error_text,
-                        )
-                        yield b'{"error": "Backend service error"}'
-            except aiohttp.ClientError as e:
-                # Handle connection errors
-                logger.error("Connection error to %s: %s", url, str(e))
-                yield b'{"error": "Service unavailable"}'
-            except asyncio.TimeoutError:
-                # Handle timeout errors
-                logger.error("Timeout connecting to %s", url)
-                yield b'{"error": "Service timeout"}'
+    PREFILL_BASE = _normalize_base_url(PREFILL_SERVICE_URL)
+    DECODE_BASE = _normalize_base_url(DECODE_SERVICE_URL)
+    KV_TARGET = _get_host_port(DECODE_SERVICE_URL)
+
+    def _build_headers(request_id: str) -> dict[str, str]:
+        """Construct the headers expected by vLLM's P2P disagg connector.
+
+        Always sets "X-Request-Id" to ``request_id`` and "X-KV-Target" to
+        ``KV_TARGET``; adds a Bearer "Authorization" header only when the
+        OPENAI_API_KEY environment variable is set to a non-empty value.
+        """
+        headers: dict[str, str] = {"X-Request-Id": request_id, "X-KV-Target": KV_TARGET}
+        api_key = os.environ.get("OPENAI_API_KEY")
+        if api_key:
+            headers["Authorization"] = f"Bearer {api_key}"
+        return headers
+
+    async def _run_prefill(
+        request_path: str,
+        payload: dict,
+        headers: dict[str, str],
+        request_id: str,
+    ):
+        """POST ``payload`` to the prefill backend and wait for the full response.
+
+        The response body is read and discarded. Raises ``RuntimeError`` when
+        the backend answers with a non-200 status, the request times out, or
+        aiohttp reports a client error.
+        """
+        url = f"{PREFILL_BASE}{request_path}"
+        start_ts = time.perf_counter()
+        logger.info("[prefill] start request_id=%s url=%s", request_id, url)
+        try:
+            # A new ClientSession is opened for each call.
+            async with (
+                aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
+                session.post(url=url, json=payload, headers=headers) as resp,
+            ):
+                if resp.status != 200:
+                    error_text = await resp.text()
+                    raise RuntimeError(
+                        f"Prefill backend error {resp.status}: {error_text}"
+                    )
+                # Read the whole body before returning; its content is not used.
+                await resp.read()
+                logger.info(
+                    "[prefill] done request_id=%s status=%s elapsed=%.2fs",
+                    request_id,
+                    resp.status,
+                    time.perf_counter() - start_ts,
+                )
+        except asyncio.TimeoutError as exc:
+            raise RuntimeError(f"Prefill service timeout at {url}") from exc
+        except aiohttp.ClientError as exc:
+            raise RuntimeError(f"Prefill service unavailable at {url}") from exc
+
+    async def _stream_decode(
+        request_path: str,
+        payload: dict,
+        headers: dict[str, str],
+        request_id: str,
+    ):
+        """POST ``payload`` to the decode backend and yield its body in chunks.
+
+        This is an async generator of ``bytes``. Failures are not raised: a
+        non-200 status, a timeout, or an aiohttp client error is logged and a
+        single JSON-formatted error message is yielded instead.
+        """
+        url = f"{DECODE_BASE}{request_path}"
+        # Stream tokens from the decode service once the prefill stage has
+        # materialized KV caches on the target workers.
+        logger.info("[decode] start request_id=%s url=%s", request_id, url)
+        try:
+            async with (
+                aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
+                session.post(url=url, json=payload, headers=headers) as resp,
+            ):
+                if resp.status != 200:
+                    error_text = await resp.text()
+                    logger.error(
+                        "Decode backend error %s - %s", resp.status, error_text
+                    )
+                    # Only the status code goes to the client; the backend's
+                    # error text is logged above but not forwarded.
+                    err_msg = (
+                        '{"error": "Decode backend error ' + str(resp.status) + '"}'
+                    )
+                    yield err_msg.encode()
+                    return
+                logger.info(
+                    "[decode] streaming response request_id=%s status=%s",
+                    request_id,
+                    resp.status,
+                )
+                # Forward the body in pieces of at most 1024 bytes.
+                async for chunk_bytes in resp.content.iter_chunked(1024):
+                    yield chunk_bytes
+                logger.info("[decode] finished streaming request_id=%s", request_id)
+        except asyncio.TimeoutError:
+            logger.error("Decode service timeout at %s", url)
+            yield b'{"error": "Decode service timeout"}'
+        except aiohttp.ClientError as exc:
+            logger.error("Decode service error at %s: %s", url, exc)
+            yield b'{"error": "Decode service unavailable"}'

    async def process_request():
        """Process a single request through prefill and decode stages"""
@ -146,13 +206,27 @@ def main():
            # Create prefill request (max_tokens=1)
            prefill_request = original_request_data.copy()
            prefill_request["max_tokens"] = 1
+            if "max_completion_tokens" in prefill_request:
+                prefill_request["max_completion_tokens"] = 1

            # Execute prefill stage
-            async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request):
-                continue
+            # The request id encodes both KV socket addresses so the backend can
+            # shuttle tensors directly via NCCL once the prefill response
+            # completes.
+            request_id = (
+                f"___prefill_addr_{PREFILL_KV_ADDR}___decode_addr_"
+                f"{DECODE_KV_ADDR}_{uuid.uuid4().hex}"
+            )
+
+            headers = _build_headers(request_id)
+            await _run_prefill(request.path, prefill_request, headers, request_id)

            # Execute decode stage and stream response
-            generator = forward_request(DECODE_SERVICE_URL, original_request_data)
+            # Pass the unmodified user request so the decode phase can continue
+            # sampling with the already-populated KV cache.
+            generator = _stream_decode(
+                request.path, original_request_data, headers, request_id
+            )
            response = await make_response(generator)
            response.timeout = None  # Disable timeout for streaming response
            return response
@ -168,23 +242,10 @@ def main():
    @app.route("/v1/completions", methods=["POST"])
    async def handle_request():
        """Handle incoming API requests with concurrency and rate limiting"""
-        # Create task for request processing
-        task = asyncio.create_task(process_request())
-
-        # Enqueue request or reject if queue is full
-        if not await request_queue.enqueue(task):
-            return Response(
-                response=b'{"error": "Server busy, try again later"}',
-                status=503,
-                content_type="application/json",
-            )
-
        try:
-            # Return the response from the processing task
-            return await task
+            return await process_request()
        except asyncio.CancelledError:
-            # Handle task cancellation (timeout or queue full)
-            logger.warning("Request cancelled due to timeout or queue full")
+            logger.warning("Request cancelled")
            return Response(
                response=b'{"error": "Request cancelled"}',
                status=503,
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@ -14,6 +14,9 @@ from tqdm import tqdm

 import vllm._custom_ops as ops
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8,
+)


@dataclass
@ -22,6 +25,7 @@ class bench_params_t:
    hidden_size: int
    add_residual: bool
    dtype: torch.dtype
+    group_size: list[int]

    def description(self):
        return (
@ -29,6 +33,7 @@ class bench_params_t:
            f"x D {self.hidden_size} "
            f"x R {self.add_residual} "
            f"x DT {self.dtype}"
+            # Leading space: the preceding "x DT ..." piece has no trailing one.
+            f" x GS {self.group_size}"
        )


@ -38,10 +43,11 @@ def get_bench_params() -> list[bench_params_t]:
    HIDDEN_SIZES = list(range(1024, 8129, 1024))
    ADD_RESIDUAL = [True, False]
    DTYPES = [torch.bfloat16, torch.float]
+    GROUP_SIZES = [[1, 64], [1, 128]]

-    combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
+    combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES)
    bench_params = list(
-        map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
+        map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations)
    )
    return bench_params

@ -52,6 +58,7 @@ def unfused_int8_impl(
    x: torch.Tensor,
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
+    group_size: list[int],
 ):
    # Norm
    torch_out = None
@ -69,6 +76,7 @@ def unfused_fp8_impl(
    x: torch.Tensor,
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
+    group_size: list[int],
 ):
    # Norm
    torch_out = None
@ -81,23 +89,63 @@ def unfused_fp8_impl(
    torch_out, _ = ops.scaled_fp8_quant(torch_out)


+def unfused_groupwise_fp8_impl(
+    rms_norm_layer: RMSNorm,
+    x: torch.Tensor,
+    residual: torch.Tensor | None,
+    quant_dtype: torch.dtype,
+    group_size: list[int],
+):
+    """Run RMSNorm and group-wise FP8 quantization as two separate calls.
+
+    Only ``group_size[1]`` is passed to the quantizer; ``quant_dtype`` is not
+    read in this function. Nothing is returned: the outputs are discarded.
+    """
+    # Norm
+    torch_out = None
+    if residual is None:
+        torch_out = rms_norm_layer.forward_cuda(x, residual)
+    else:
+        # With a residual, forward_cuda's result is unpacked as a pair and
+        # only the first element is kept.
+        torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
+
+    # Quant
+    torch_out, _ = per_token_group_quant_fp8(
+        torch_out, group_size=group_size[1], use_ue8m0=False
+    )
+
+
 def fused_impl(
    rms_norm_layer: RMSNorm,  # this stores the weights
    x: torch.Tensor,
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
+    group_size: list[int],
 ):
    out, _ = ops.rms_norm_dynamic_per_token_quant(
        x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
    )


+def fused_groupwise_impl(
+    rms_norm_layer: RMSNorm,  # this stores the weights
+    x: torch.Tensor,
+    residual: torch.Tensor | None,
+    quant_dtype: torch.dtype,
+    group_size: list[int],
+):
+    """Call ``ops.rms_norm_per_block_quant`` once on ``x`` with the layer's weight.
+
+    The literal 1e-6 is passed as the third positional argument (presumably
+    the norm epsilon — TODO confirm against the op's signature). Nothing is
+    returned: the outputs are discarded.
+    """
+    out, _ = ops.rms_norm_per_block_quant(
+        x,
+        rms_norm_layer.weight,
+        1e-6,
+        quant_dtype,
+        group_size,
+        residual=residual,
+        is_scale_transposed=True,
+    )
+
+
 # Bench functions
 def bench_fn(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
    residual: torch.Tensor,
    quant_dtype: torch.dtype,
+    group_size: list[int],
    label: str,
    sub_label: str,
    fn: Callable,
@ -110,10 +158,11 @@ def bench_fn(
        "x": x,
        "residual": residual,
        "quant_dtype": quant_dtype,
+        "group_size": group_size,
        "fn": fn,
    }
    return TBenchmark.Timer(
-        stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
+        stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)",
        globals=globals,
        label=label,
        sub_label=sub_label,
@ -147,6 +196,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
            x,
            residual,
            torch.int8,
+            params.group_size,
            label,
            sub_label,
            unfused_int8_impl,
@ -161,6 +211,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
            x,
            residual,
            torch.float8_e4m3fn,
+            params.group_size,
            label,
            sub_label,
            unfused_fp8_impl,
@ -175,6 +226,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
            x,
            residual,
            torch.int8,
+            params.group_size,
            label,
            sub_label,
            fused_impl,
@ -189,6 +241,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
            x,
            residual,
            torch.float8_e4m3fn,
+            params.group_size,
            label,
            sub_label,
            fused_impl,
@ -196,6 +249,36 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
        )
    )

+    # unfused groupwise fp8 impl.
+    timers.append(
+        bench_fn(
+            layer,
+            x,
+            residual,
+            torch.float8_e4m3fn,
+            params.group_size,
+            label,
+            sub_label,
+            unfused_groupwise_fp8_impl,
+            "unfused_groupwise_fp8_impl",
+        )
+    )
+
+    # fused groupwise fp8 impl.
+    timers.append(
+        bench_fn(
+            layer,
+            x,
+            residual,
+            torch.float8_e4m3fn,
+            params.group_size,
+            label,
+            sub_label,
+            fused_groupwise_impl,
+            "fused_groupwise_fp8_impl",
+        )
+    )
+
    print_timers(timers)

    return timers
--- a/benchmarks/kernels/bench_nvfp4_quant.py
+++ b/benchmarks/kernels/bench_nvfp4_quant.py
@ -0,0 +1,177 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import copy
+import itertools
+
+import torch
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
+from vllm.scalar_type import scalar_types
+from vllm.triton_utils import triton
+from vllm.utils.flashinfer import flashinfer_fp4_quantize
+
+if not current_platform.has_device_capability(100):
+    raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)")
+
+FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
+FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
+
+PROVIDER_CFGS = {
+    "vllm": dict(backend="vllm", enabled=True),
+    "flashinfer": dict(backend="flashinfer", enabled=True),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor:
+    """Return the global scale used for FP4 quantization of ``tensor``.
+
+    The scale is ``FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX`` divided by the largest
+    absolute element of ``tensor`` (taken as float32).
+    """
+    abs_max = tensor.abs().max().to(torch.float32)
+    numerator = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX
+    return numerator / abs_max
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
+        x_log=False,
+        line_arg="provider",
+        line_vals=_enabled,
+        line_names=_enabled,
+        ylabel="us (lower is better)",
+        plot_name="NVFP4 Input Quantization Latency (us)",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K):
+    """Measure NVFP4 input-quantization latency for one provider.
+
+    Args:
+        batch_size: Number of rows (M) of the random bfloat16 activation.
+        provider: Key into ``PROVIDER_CFGS`` selecting the backend.
+        N: Accepted so callers can pass the weight's N dimension; it is not
+            used by the measurement itself.
+        K: Number of columns of the activation.
+
+    Returns:
+        ``(median, low, high)`` latencies in microseconds, taken at the
+        0.5 / 0.2 / 0.8 quantiles, in the order the perf report unpacks
+        them (value, min, max).
+
+    Raises:
+        ValueError: If the provider's backend is not recognized.
+    """
+    M = batch_size
+    device = "cuda"
+    dtype = torch.bfloat16
+
+    # Create input tensor
+    a = torch.randn((M, K), device=device, dtype=dtype)
+
+    # Compute global scale for activation
+    a_global_scale = compute_global_scale(a)
+
+    # Results come back in this order: median, 20th and 80th percentile.
+    quantiles = [0.5, 0.2, 0.8]
+
+    cfg = PROVIDER_CFGS[provider]
+
+    if cfg["backend"] == "vllm":
+        # vLLM's FP4 quantization
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: ops.scaled_fp4_quant(a, a_global_scale),
+            quantiles=quantiles,
+        )
+    elif cfg["backend"] == "flashinfer":
+        # FlashInfer's FP4 quantization
+        # Use is_sf_swizzled_layout=True to match vLLM's output format
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: flashinfer_fp4_quantize(
+                a, a_global_scale, is_sf_swizzled_layout=True
+            ),
+            quantiles=quantiles,
+        )
+    else:
+        # Fail loudly instead of hitting an unbound ``ms`` below.
+        raise ValueError(f"Unknown backend: {cfg['backend']!r}")
+
+    # Convert ms to us for better readability at small batch sizes.
+    # Latency is reported directly (not inverted into a throughput), so the
+    # low quantile must stay in the "min" slot and the high one in "max".
+    to_us = lambda t_ms: t_ms * 1000
+    return to_us(ms), to_us(min_ms), to_us(max_ms)
+
+
+def prepare_shapes(args):
+    """Expand the selected models and TP sizes into per-layer shape entries.
+
+    For every (model, tp_size) combination, each weight shape listed in
+    ``WEIGHT_SHAPES[model]`` has its tensor-parallel dimension floor-divided
+    by ``tp_size``; the model name is appended to the resulting entry.
+    """
+    shapes = []
+    for model in args.models:
+        for tp_size in args.tp_sizes:
+            # Deep-copy so the shared WEIGHT_SHAPES table is never mutated.
+            for dims, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+                dims[tp_dim] //= tp_size
+                shapes.append([*dims, model])
+    return shapes
+
+
+def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
+    """Check that vLLM and FlashInfer FP4 quantization agree on one input.
+
+    Quantizes a single random ``(M, K)`` tensor with both backends, asserts
+    that the quantized outputs match via ``torch.testing.assert_close`` and
+    prints a PASSED line on success.
+    """
+    # Create input tensor
+    a = torch.randn((M, K), device=device, dtype=dtype)
+
+    # Compute global scale
+    a_global_scale = compute_global_scale(a)
+
+    # vLLM quantization
+    vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale)
+
+    # FlashInfer quantization (with swizzled layout to match vLLM's output)
+    flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize(
+        a, a_global_scale, is_sf_swizzled_layout=True
+    )
+    # Reinterpret the FlashInfer scale buffer as float8_e4m3fn.
+    flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn)
+
+    # Compare outputs
+    # NOTE(review): only the quantized values are compared; ``vllm_scale`` and
+    # ``flashinfer_scale`` are computed but never checked -- confirm whether a
+    # scale comparison was intended here.
+    torch.testing.assert_close(
+        vllm_fp4,
+        flashinfer_fp4,
+    )
+    print(f"M={M}, K={K}, dtype={dtype}: PASSED")
+
+
+def test_accuracy():
+    """Run the vLLM-vs-FlashInfer accuracy check over a small shape grid."""
+    banner = "=" * 60
+    print("\n" + banner)
+    print("Running accuracy tests: vLLM vs FlashInfer")
+    print(banner)
+
+    # Batch sizes crossed with hidden dimensions, all in bfloat16 on CUDA.
+    batch_sizes = [1, 1024]
+    hidden_dims = [4096]
+    for num_rows, hidden in itertools.product(batch_sizes, hidden_dims):
+        _test_accuracy_once(num_rows, hidden, torch.bfloat16, "cuda")
+
+    print("\nAll accuracy tests passed!")
+
+
+# Script entry point: parse CLI options, optionally run the accuracy checks,
+# then benchmark every prepared weight shape.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Benchmark NVFP4 quantization: vLLM vs FlashInfer"
+    )
+    # Models are restricted to the keys of the WEIGHT_SHAPES table.
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.1-8B-Instruct"],
+        choices=list(WEIGHT_SHAPES.keys()),
+    )
+    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
+    parser.add_argument(
+        "--save-path",
+        type=str,
+        default=None,
+        help="Path to save benchmark results",
+    )
+    parser.add_argument(
+        "--accuracy",
+        action="store_true",
+        help="Run accuracy tests",
+    )
+    args = parser.parse_args()
+
+    # The accuracy tests run in addition to the benchmarks, not instead of them.
+    if args.accuracy:
+        test_accuracy()
+
+    # Each prepared entry is unpacked as (K, N, model).
+    # NOTE(review): this assumes WEIGHT_SHAPES entries are [K, N] pairs -- confirm.
+    for K, N, model in prepare_shapes(args):
+        print(f"\n{model}, N={N} K={K}")
+        benchmark.run(
+            print_data=True,
+            save_path=args.save_path,
+            N=N,
+            K=K,
+        )
+
+    print("\nBenchmark finished!")
--- a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
@ -0,0 +1,244 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from dataclasses import dataclass
+from enum import Enum
+from itertools import product
+from typing import Any
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    _per_token_group_quant_fp8_colmajor,
+    silu_mul_per_token_group_quant_fp8_colmajor,
+)
+from vllm.triton_utils import triton
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+
+from .utils import ArgPool, Bench, CudaGraphBenchParams
+
+GROUP_SIZE = 128
+FLOAT8_T = torch.float8_e4m3fn
+
+
+def print_timers(timers: list[TMeasurement], cuda_graph_nops: int):
+    """Print a comparison table for ``timers`` followed by a CUDA-graph note.
+
+    Args:
+        timers: Measurements to render with ``torch.utils.benchmark.Compare``.
+        cuda_graph_nops: Number of consecutive invocations each reported
+            timing covers; mentioned in the note so readers can normalize.
+    """
+    compare = TBenchmark.Compare(timers)
+    compare.print()
+    # The note refers to "the timings reported above", so it has to be
+    # printed after the table rather than before it.
+    print(
+        f"Note : The timings reported above is for {cuda_graph_nops} "
+        "consecutive invocations of the benchmarking functions. "
+        f"Please divide by {cuda_graph_nops} for single invocation "
+        "timings."
+    )
+
+
+class ImplType(Enum):
+    """Implementations that this benchmark can run and compare."""
+
+    SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1
+    REFERENCE = 2
+
+    def get_impl(self):
+        """Return the callable for this variant; raise ValueError if unknown."""
+        if self is ImplType.REFERENCE:
+            return reference
+        if self is ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
+            return silu_mul_per_token_group_quant_fp8_colmajor
+        raise ValueError(f"Unrecognized ImplType {self}")
+
+
+@dataclass
+class BenchmarkTensors:
+    """Input tensor plus pre-allocated output buffers for one invocation."""
+
+    input: torch.Tensor
+    output: torch.Tensor
+
+    # Buffers used only by the reference implementation.
+    ref_act_out: torch.Tensor
+    ref_quant_out: torch.Tensor
+
+    @staticmethod
+    def make(T: int, N: int) -> "BenchmarkTensors":
+        """Allocate CUDA tensors for ``T`` tokens and an input width of ``N``.
+
+        ``T`` must be a multiple of ``GROUP_SIZE`` and ``N`` a multiple of
+        ``2 * GROUP_SIZE``; every output buffer has shape ``(T, N // 2)``.
+        """
+        assert T % GROUP_SIZE == 0
+        assert N % (GROUP_SIZE * 2) == 0
+
+        out_shape = (T, N // 2)
+        bf16_cuda = {"dtype": torch.bfloat16, "device": "cuda"}
+
+        x = torch.rand((T, N), **bf16_cuda)
+
+        # Buffer written by silu_mul_per_token_group_quant_fp8_colmajor.
+        fused_out = torch.rand(out_shape, **bf16_cuda).to(FLOAT8_T)
+
+        # Buffers written by the reference path.
+        act_buf = torch.empty(out_shape, **bf16_cuda)
+        quant_buf = torch.empty(out_shape, **bf16_cuda).to(FLOAT8_T)
+
+        return BenchmarkTensors(
+            input=x,
+            output=fused_out,
+            ref_act_out=act_buf,
+            ref_quant_out=quant_buf,
+        )
+
+    @property
+    def T(self):
+        """Number of rows of ``input``."""
+        return self.input.shape[0]
+
+    @property
+    def N(self):
+        """Number of columns of ``input``."""
+        return self.input.shape[1]
+
+    def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]:
+        """Build the keyword arguments expected by ``impl_type``'s callable."""
+        if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
+            return dict(
+                input=self.input,
+                output=self.output,
+                use_ue8m0=is_deep_gemm_e8m0_used(),
+            )
+        if impl_type == ImplType.REFERENCE:
+            return dict(
+                input=self.input,
+                act_out=self.ref_act_out,
+                quant_out=self.ref_quant_out,
+                use_ue8m0=is_deep_gemm_e8m0_used(),
+            )
+        raise ValueError(f"Unrecognized impl_type {impl_type}")
+
+
+def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool):
+    """
+    Reference quantization path that launches the triton kernel
+    ``_per_token_group_quant_fp8_colmajor`` imported from
+    vllm.model_executor.layers.quantization.utils.fp8_utils.
+
+    ``quant_out`` must have the same size as ``x``; it is passed to the
+    kernel as the quantized-output buffer and returned as the first element
+    of the result. The second element is a newly allocated float32 scale
+    tensor.
+
+    NOTE(review): ``x.shape[1]`` and ``x.stride(0)`` are passed to the
+    kernel, so ``x`` is presumably 2-D -- confirm against callers.
+    """
+    assert quant_out.size() == x.size()
+    # Allocate the scale tensor column-major format.
+    # (Allocated with the group dimension first, then the last two dims are
+    # swapped through ``permute``.)
+    shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
+    x_q = quant_out
+    x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
+
+    # The launch grid below has M entries, one per GROUP_SIZE elements of x.
+    M = x.numel() // GROUP_SIZE
+    N = GROUP_SIZE
+    BLOCK = triton.next_power_of_2(N)
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK // 256, 1), 8)
+    num_stages = 1
+
+    # Representable range of the FP8 output dtype, forwarded to the kernel.
+    finfo = torch.finfo(FLOAT8_T)
+    fp8_min = finfo.min
+    fp8_max = finfo.max
+
+    _per_token_group_quant_fp8_colmajor[(M,)](
+        x,
+        x_q,
+        x_s,
+        GROUP_SIZE,
+        x.shape[1],
+        x.stride(0),
+        x_s.stride(1),
+        eps=1e-10,
+        fp8_min=fp8_min,
+        fp8_max=fp8_max,
+        use_ue8m0=use_ue8m0,
+        BLOCK=BLOCK,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    return x_q, x_s
+
+
+def reference(
+    input: torch.Tensor,
+    act_out: torch.Tensor,
+    quant_out: torch.Tensor,
+    use_ue8m0: bool,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Unfused baseline: ``silu_and_mul`` into ``act_out``, then quantize it.
+
+    Returns the pair produced by ``reference_quant``.
+    """
+    # Step 1: activation, written into the caller-provided buffer.
+    torch.ops._C.silu_and_mul(act_out, input)
+    # Step 2: quantization of the activation output.
+    quantized, scales = reference_quant(act_out, quant_out, use_ue8m0)
+    return quantized, scales
+
+
+def bench_impl(
+    bench_tensors: list[BenchmarkTensors], impl_type: ImplType
+) -> TMeasurement:
+    """Benchmark ``impl_type`` over a pool of pre-allocated argument sets.
+
+    Args:
+        bench_tensors: One ``BenchmarkTensors`` per argument set; the first
+            entry supplies the ``T``/``N`` values used in the label.
+        impl_type: Which implementation to run.
+
+    Returns:
+        The measurement returned by ``Bench.run()``.
+    """
+    T = bench_tensors[0].T
+    N = bench_tensors[0].N
+
+    impl = impl_type.get_impl()
+    arg_pool_size = len(bench_tensors)
+    kwargs_list = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors]
+
+    # warmup
+    for kwargs in kwargs_list:
+        impl(**kwargs)
+    torch.cuda.synchronize()
+
+    # Merge into a single kwargs and qualify arguments as ArgPool
+    kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
+    for _kwargs in kwargs_list:
+        for k, v in _kwargs.items():
+            kwargs[k].values.append(v)
+
+    cuda_graph_params = CudaGraphBenchParams(arg_pool_size)
+    with Bench(
+        cuda_graph_params,
+        "silu-mul-quant",
+        f"num_tokens={T}, N={N}",
+        impl_type.name,
+        impl,
+        **kwargs,
+    ) as bench:
+        timer = bench.run()
+    return timer
+
+
+def test_correctness(T: int, N: int):
+    """Assert that the fused kernel matches the reference for one (T, N)."""
+    print(f"Testing num_tokens={T}, N={N} ...")
+
+    tensors = BenchmarkTensors.make(T, N)
+
+    # Reference output.
+    ref_kind = ImplType.REFERENCE
+    ref_out_q, ref_out_s = ref_kind.get_impl()(
+        **tensors.make_impl_kwargs(ref_kind)
+    )
+
+    # Output of the implementation under test.
+    fused_kind = ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
+    out_q, out_s = fused_kind.get_impl()(**tensors.make_impl_kwargs(fused_kind))
+
+    # Quantized values are compared after widening to float32.
+    torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32))
+    torch.testing.assert_close(ref_out_s, out_s)
+
+
+def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]:
+    """Check correctness and benchmark both implementations for every shape.
+
+    Iterates over ``product(Ns, Ts)``; for each shape it prints the pair of
+    timings, and at the end prints and returns all collected timings.
+    """
+    all_timers = []
+    impl_order = (
+        ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR,
+        ImplType.REFERENCE,
+    )
+    for N, T in product(Ns, Ts):
+        test_correctness(T, N)
+
+        pool: list[BenchmarkTensors] = [
+            BenchmarkTensors.make(T, N) for _ in range(arg_pool_size)
+        ]
+
+        # Fused kernel first, then the reference, sharing the same pool.
+        shape_timers = [bench_impl(pool, impl) for impl in impl_order]
+        all_timers.extend(shape_timers)
+
+        print_timers(shape_timers, cuda_graph_nops=arg_pool_size)
+
+    print_timers(all_timers, cuda_graph_nops=arg_pool_size)
+
+    return all_timers
+
+
+# Script entry point: sweep token counts and hidden sizes with a pool of
+# eight argument sets per shape.
+if __name__ == "__main__":
+    # Token counts: 128..1920 in steps of 128, then 2048..131072 in steps of 2048.
+    T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)]
+    N = [2048, 4096, 8192]
+
+    print(f"T = {T}, N = {N}")
+    run(T, N, arg_pool_size=8)
--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@ -13,8 +13,8 @@ from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

-batch_size_range = [1, 16, 32, 64, 128]
-seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+batch_size_range = [1, 16, 128]
+seq_len_range = [1, 16, 64, 1024, 4096]
 intermediate_size = [3072, 9728, 12288]
 configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))

--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@ -237,6 +237,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
            b_q_weight=w_q,
            b_bias=None,
            b_scales=w_s,
+            a_scales=None,
            global_scale=None,
            b_zeros=w_zp,
            g_idx=g_idx,
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@ -263,7 +263,7 @@ def bench_run(

    results.append(
        benchmark.Timer(
-            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
+            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
@ -273,7 +273,7 @@ def bench_run(

    results.append(
        benchmark.Timer(
-            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
+            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
--- a/benchmarks/kernels/benchmark_mla_k_concat.py
+++ b/benchmarks/kernels/benchmark_mla_k_concat.py
@ -0,0 +1,150 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
+in MLA (Multi-head Latent Attention) prefill.
+
+This validates that the optimization from commit 8d4142bd is beneficial across
+various batch sizes, not just the originally tested batch size of 32768.
+"""
+
+import time
+from collections.abc import Callable
+
+import torch
+
+# DeepSeek-V3 MLA dimensions
+NUM_HEADS = 128
+QK_NOPE_HEAD_DIM = 128
+PE_DIM = 64
+
+
+def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
+    """Baseline: expand ``k_pe`` to ``k_nope``'s leading dims, then ``torch.cat``."""
+    leading_dims = k_nope.shape[:-1]
+    k_pe_expanded = k_pe.expand((*leading_dims, -1))
+    return torch.cat((k_nope, k_pe_expanded), dim=-1)
+
+
+def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
+    """Allocate the output once and assign both halves into it directly."""
+    nope_dim = k_nope.shape[-1]
+    out_shape = (*k_nope.shape[:-1], nope_dim + k_pe.shape[-1])
+    k = torch.empty(out_shape, dtype=k_nope.dtype, device=k_nope.device)
+    # Slice assignment broadcasts ``k_pe`` over the leading dimensions.
+    k[..., :nope_dim] = k_nope
+    k[..., nope_dim:] = k_pe
+    return k
+
+
+def benchmark_method(
+    method: Callable,
+    k_nope: torch.Tensor,
+    k_pe: torch.Tensor,
+    num_warmup: int = 10,
+    num_iters: int = 100,
+) -> float:
+    """Time ``method(k_nope, k_pe)`` and return the mean latency in ms.
+
+    Runs ``num_warmup`` untimed calls, then ``num_iters`` timed calls, with a
+    CUDA synchronize after each phase.
+    """
+
+    def _run(times: int) -> None:
+        # Issue ``times`` back-to-back calls, then wait for the device.
+        for _ in range(times):
+            _ = method(k_nope, k_pe)
+        torch.cuda.synchronize()
+
+    _run(num_warmup)
+
+    t0 = time.perf_counter()
+    _run(num_iters)
+    elapsed = time.perf_counter() - t0
+
+    return elapsed / num_iters * 1000  # Convert to ms
+
+
+@torch.inference_mode()
+def run_benchmark(dtype: torch.dtype, dtype_name: str):
+    """Run benchmark for a specific dtype.
+
+    Times ``cat_method`` and ``direct_copy_method`` for each batch size,
+    prints a results table plus summary statistics, and returns a list of
+    ``(batch_size, cat_time, direct_time, speedup, reduction)`` tuples.
+    Times are in ms; ``reduction`` is a percentage.
+
+    Note: this sets the process-wide default torch device to "cuda".
+    """
+    torch.set_default_device("cuda")
+
+    # Batch sizes to test (powers of 2 from 32 to 65536)
+    batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]
+
+    print("=" * 80)
+    print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
+    print("=" * 80)
+    print(
+        f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
+        f"k_pe=[B, 1, {PE_DIM}]"
+    )
+    print(f"dtype: {dtype_name}")
+    print()
+    print(
+        f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
+        f"{'Speedup':>8} | {'Reduction':>10}"
+    )
+    print("-" * 70)
+
+    results = []
+    for batch_size in batch_sizes:
+        # Create input tensors (generate in float32 then convert for FP8 compatibility)
+        k_nope = torch.randn(
+            batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
+        ).to(dtype)
+        k_pe = torch.randn(
+            batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda"
+        ).to(dtype)
+
+        # Benchmark both methods
+        cat_time = benchmark_method(cat_method, k_nope, k_pe)
+        direct_time = benchmark_method(direct_copy_method, k_nope, k_pe)
+
+        # speedup > 1 means the direct copy is faster; reduction is the
+        # percentage of cat's latency saved by the direct copy.
+        speedup = cat_time / direct_time
+        reduction = (1 - direct_time / cat_time) * 100
+
+        results.append((batch_size, cat_time, direct_time, speedup, reduction))
+
+        print(
+            f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | "
+            f"{speedup:>7.2f}x | {reduction:>9.1f}%"
+        )
+
+    print("=" * 80)
+
+    # Summary statistics
+    speedups = [r[3] for r in results]
+    print("\nSpeedup summary:")
+    print(f"  Min:  {min(speedups):.2f}x")
+    print(f"  Max:  {max(speedups):.2f}x")
+    print(f"  Mean: {sum(speedups) / len(speedups):.2f}x")
+
+    # Find crossover point
+    # (the first batch size, in ascending order, whose speedup is >= 1.0)
+    crossover_batch = None
+    for batch_size, _, _, speedup, _ in results:
+        if speedup >= 1.0:
+            crossover_batch = batch_size
+            break
+
+    print("\nConclusion:")
+    if crossover_batch:
+        print(f"  - Direct copy becomes beneficial at batch size >= {crossover_batch}")
+    # Filter for large batches (>= 512 which is typical for prefill)
+    large_batch_speedups = [r[3] for r in results if r[0] >= 512]
+    if large_batch_speedups:
+        avg_large = sum(large_batch_speedups) / len(large_batch_speedups)
+        print(f"  - For batch sizes >= 512: avg speedup = {avg_large:.2f}x")
+    # NOTE(review): this line is printed unconditionally, regardless of the
+    # speedups measured above.
+    print("  - MLA prefill typically uses large batches, so optimization is effective")
+
+    return results
+
+
+@torch.inference_mode()
+def main():
+    """Run the concatenation benchmark for bfloat16 and then float8_e4m3fn."""
+    dtype_cases = (
+        (torch.bfloat16, "bfloat16"),
+        (torch.float8_e4m3fn, "float8_e4m3fn"),
+    )
+    for dtype, dtype_name in dtype_cases:
+        # Blank lines separate the per-dtype reports.
+        print("\n")
+        run_benchmark(dtype, dtype_name)
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/kernels/benchmark_moe_align_block_size.py
+++ b/benchmarks/kernels/benchmark_moe_align_block_size.py
@ -24,12 +24,15 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
 num_tokens_range = [1, 16, 256, 4096]
 num_experts_range = [16, 64, 224, 256, 280, 512]
 topk_range = [1, 2, 8]
-configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
+ep_size_range = [1, 8]
+configs = list(
+    itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range)
+)


@triton.testing.perf_report(
    triton.testing.Benchmark(
-        x_names=["num_tokens", "num_experts", "topk"],
+        x_names=["num_tokens", "num_experts", "topk", "ep_size"],
        x_vals=configs,
        line_arg="provider",
        line_vals=["vllm"],
@ -38,16 +41,26 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
        args={},
    )
 )
-def benchmark(num_tokens, num_experts, topk, provider):
+def benchmark(num_tokens, num_experts, topk, ep_size, provider):
    """Benchmark function for Triton."""
    block_size = 256
+    torch.cuda.manual_seed_all(0)
    topk_ids = get_topk_ids(num_tokens, num_experts, topk)

+    e_map = None
+    if ep_size != 1:
+        local_e = num_experts // ep_size
+        e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e]
+        e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
+        e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
+
    quantiles = [0.5, 0.2, 0.8]

    if provider == "vllm":
        ms, min_ms, max_ms = triton.testing.do_bench(
-            lambda: moe_align_block_size(topk_ids, block_size, num_experts),
+            lambda: moe_align_block_size(
+                topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True
+            ),
            quantiles=quantiles,
        )

--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@ -99,7 +99,6 @@ def benchmark_mrope(
    # the parameters to compute the q k v size based on tp_size
    mrope_helper_class = get_rope(
        head_size=head_dim,
-        rotary_dim=head_dim,
        max_position=max_position,
        is_neox_style=is_neox_style,
        rope_parameters=rope_parameters,
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
    def benchmark(batch_size, seq_len, num_heads, provider):
        dtype = torch.bfloat16
        max_position = 8192
-        base = 10000
-        rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+        rope_parameters = {"partial_rotary_factor": rotary_dim / head_size}
+        rope = get_rope(head_size, max_position, is_neox_style, rope_parameters)
        rope = rope.to(dtype=dtype, device=device)
        cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)

--- a/benchmarks/kernels/deepgemm/README.md
+++ b/benchmarks/kernels/deepgemm/README.md
@ -2,7 +2,7 @@

 This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels.

-Currently this just includes dense GEMMs and only works on Hopper GPUs.
+Currently, this just includes dense GEMMs and only works on Hopper GPUs.

 ## Setup

--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
        endif()

        # Build ACL with CMake
-        set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
-        set(CMAKE_BUILD_TYPE "Release")
-        set(ARM_COMPUTE_ARCH "armv8.2-a")
-        set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
-        set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
-        set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
-        set(ARM_COMPUTE_ENABLE_OPENMP "ON")
-        set(ARM_COMPUTE_ENABLE_WERROR "OFF")
-        set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
-        set(ARM_COMPUTE_BUILD_TESTING "OFF")
-
        set(_cmake_config_cmd
             ${CMAKE_COMMAND} -G Ninja -B build 
            -DARM_COMPUTE_BUILD_SHARED_LIB=OFF 
@ -341,7 +330,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
        PUBLIC ${oneDNN_BINARY_DIR}/include
        PRIVATE ${oneDNN_SOURCE_DIR}/src
    )
-    target_link_libraries(dnnl_ext dnnl)
+    target_link_libraries(dnnl_ext dnnl torch)
    target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
    list(APPEND LIBS dnnl_ext)
    set(USE_ONEDNN ON)
@ -369,13 +358,13 @@ set(VLLM_EXT_SRC
    "csrc/cpu/pos_encoding.cpp"
    "csrc/moe/dynamic_4bit_int_moe_cpu.cpp"
    "csrc/cpu/cpu_attn.cpp"
-    "csrc/cpu/scratchpad_manager.cpp"
    "csrc/cpu/torch_bindings.cpp")

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
    set(VLLM_EXT_SRC
        "csrc/cpu/shm.cpp"
        "csrc/cpu/cpu_wna16.cpp"
+        "csrc/cpu/cpu_fused_moe.cpp"
        ${VLLM_EXT_SRC})
    if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
        set(VLLM_EXT_SRC
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@ -35,16 +35,21 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
 # sm90a

 set(SUPPORT_ARCHS)
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
-    list(APPEND SUPPORT_ARCHS 9.0a)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
+    list(APPEND SUPPORT_ARCHS "9.0a")
 endif()
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
-    list(APPEND SUPPORT_ARCHS 10.0a)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
+    # CUDA 12.9 has introduced "Family-Specific Architecture Features"
+    # this supports all compute_10x family
+    list(APPEND SUPPORT_ARCHS "10.0f")
+elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+    list(APPEND SUPPORT_ARCHS "10.0a")
 endif()


 cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
 if(FLASH_MLA_ARCHS)
+    message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}")
    set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
    list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")

@ -126,7 +131,8 @@ if(FLASH_MLA_ARCHS)
        $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
        $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
 else()
-    # Create empty targets for setup.py when not targeting sm90a systems
+    message(STATUS "FlashMLA will not compile: unsupported CUDA architecture ${CUDA_ARCHS}")
+    # Create empty targets for setup.py on unsupported systems
    add_custom_target(_flashmla_C)
    add_custom_target(_flashmla_extension_C)
 endif()
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 71bb26f6295449be880344b93b51791cc009237d
+          GIT_TAG 86f8f157cf82aa2342743752b97788922dd7de43
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@ -140,16 +140,21 @@ function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
  run_python(_VLLM_TORCH_GOMP_PATH
    "
 import os, glob
-try:
-  import torch
-  torch_pkg = os.path.dirname(torch.__file__)
-  site_root = os.path.dirname(torch_pkg)
-  torch_libs = os.path.join(site_root, 'torch.libs')
-  print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
-except:
-  print('')
+import torch
+torch_pkg = os.path.dirname(torch.__file__)
+site_root = os.path.dirname(torch_pkg)
+
+# Search both torch.libs and torch/lib
+roots = [os.path.join(site_root, 'torch.libs'), os.path.join(torch_pkg, 'lib')]
+candidates = []
+for root in roots:
+    if not os.path.isdir(root):
+        continue
+    candidates.extend(glob.glob(os.path.join(root, 'libgomp*.so*')))
+
+print(candidates[0] if candidates else '')
 "
-    "failed to probe torch.libs for libgomp")
+    "failed to probe for libgomp")

  if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
    return()
@ -495,7 +500,13 @@ function (define_extension_target MOD_NAME)
    set(SOABI_KEYWORD "")
  endif()

-  if (ARG_USE_SABI)
+  run_python(IS_FREETHREADED_PYTHON
+    "import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
+    "Failed to determine whether interpreter is free-threaded")
+
+  # Free-threaded Python doesn't yet support the stable ABI (see PEP 803/809),
+  # so avoid using the stable ABI under free-threading only.
+  if (ARG_USE_SABI AND NOT IS_FREETHREADED_PYTHON)
    Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
  else()
    Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@ -15,19 +15,61 @@ __device__ __forceinline__ scalar_t compute(const scalar_t& x,
                                            const scalar_t& y) {
  return act_first ? ACT_FN(x) * y : x * ACT_FN(y);
 }
-// Activation and gating kernel template.

+// Check if all pointers are 16-byte aligned for int4 vectorized access
+__device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
+  return (reinterpret_cast<uintptr_t>(ptr) & 15) == 0;
+}
+
+// Activation and gating kernel template.
 template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
          bool act_first>
 __global__ void act_and_mul_kernel(
    scalar_t* __restrict__ out,          // [..., d]
    const scalar_t* __restrict__ input,  // [..., 2, d]
    const int d) {
+  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
  const int64_t token_idx = blockIdx.x;
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
-    const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
-    out[token_idx * d + idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
+  const scalar_t* x_ptr = input + token_idx * 2 * d;
+  const scalar_t* y_ptr = x_ptr + d;
+  scalar_t* out_ptr = out + token_idx * d;
+
+  // Check alignment for 128-bit vectorized access.
+  // All three pointers must be 16-byte aligned for safe int4 operations.
+  const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
+                       is_16byte_aligned(out_ptr);
+
+  if (aligned && d >= VEC_SIZE) {
+    // Fast path: 128-bit vectorized loop
+    const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
+    const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
+    int4* out_vec = reinterpret_cast<int4*>(out_ptr);
+    const int num_vecs = d / VEC_SIZE;
+    const int vec_end = num_vecs * VEC_SIZE;
+
+    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+      int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
+      auto* xp = reinterpret_cast<scalar_t*>(&x);
+      auto* yp = reinterpret_cast<scalar_t*>(&y);
+      auto* rp = reinterpret_cast<scalar_t*>(&r);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; j++) {
+        rp[j] = compute<scalar_t, ACT_FN, act_first>(xp[j], yp[j]);
+      }
+      out_vec[i] = r;
+    }
+    // Scalar cleanup for remaining elements
+    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+      out_ptr[i] = compute<scalar_t, ACT_FN, act_first>(VLLM_LDG(&x_ptr[i]),
+                                                        VLLM_LDG(&y_ptr[i]));
+    }
+  } else {
+    // Scalar fallback for unaligned data or small d
+    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      const scalar_t x = VLLM_LDG(&x_ptr[idx]);
+      const scalar_t y = VLLM_LDG(&y_ptr[idx]);
+      out_ptr[idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
+    }
  }
 }

@ -120,50 +162,115 @@ template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&, const float)>
 __global__ void act_and_mul_kernel_with_param(
    scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
    const float param) {
+  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
  const int64_t token_idx = blockIdx.x;
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
-    const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
-    out[token_idx * d + idx] = ACT_FN(x, param) * y;
+  const scalar_t* x_ptr = input + token_idx * 2 * d;
+  const scalar_t* y_ptr = x_ptr + d;
+  scalar_t* out_ptr = out + token_idx * d;
+
+  // Check alignment for 128-bit vectorized access
+  const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
+                       is_16byte_aligned(out_ptr);
+
+  if (aligned && d >= VEC_SIZE) {
+    // Fast path: 128-bit vectorized loop
+    const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
+    const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
+    int4* out_vec = reinterpret_cast<int4*>(out_ptr);
+    const int num_vecs = d / VEC_SIZE;
+    const int vec_end = num_vecs * VEC_SIZE;
+
+    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+      int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
+      auto* xp = reinterpret_cast<scalar_t*>(&x);
+      auto* yp = reinterpret_cast<scalar_t*>(&y);
+      auto* rp = reinterpret_cast<scalar_t*>(&r);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; j++) {
+        rp[j] = ACT_FN(xp[j], param) * yp[j];
+      }
+      out_vec[i] = r;
+    }
+    // Scalar cleanup for remaining elements
+    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+      out_ptr[i] = ACT_FN(VLLM_LDG(&x_ptr[i]), param) * VLLM_LDG(&y_ptr[i]);
+    }
+  } else {
+    // Scalar fallback for unaligned data or small d
+    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      const scalar_t x = VLLM_LDG(&x_ptr[idx]);
+      const scalar_t y = VLLM_LDG(&y_ptr[idx]);
+      out_ptr[idx] = ACT_FN(x, param) * y;
+    }
  }
 }

 template <typename T>
 __device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up,
                                               float alpha, float limit) {
-  // clamp gate: min=None, max=limit
-  const float gate_f = (float)gate;
-  const float clamped_gate = gate_f > limit ? limit : gate_f;
-
-  // clamp up: min=-limit, max=limit
-  const float up_f = (float)up;
-  const float clamped_up =
-      up_f > limit ? limit : (up_f < -limit ? -limit : up_f);
-
-  // glu = gate * sigmoid(gate * alpha)
-  const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha));
-  const float glu = clamped_gate * sigmoid_val;
-
-  // (up + 1) * glu
-  return (T)((clamped_up + 1.0f) * glu);
+  // Clamp gate to (-inf, limit] and up to [-limit, limit]
+  const float g = fminf((float)gate, limit);
+  const float u = fmaxf(fminf((float)up, limit), -limit);
+  // glu = gate * sigmoid(gate * alpha), then return (up + 1) * glu
+  return (T)((u + 1.0f) * g / (1.0f + expf(-g * alpha)));
 }

+// Interleaved gate/up: input has [gate0, up0, gate1, up1, ...].
 template <typename scalar_t,
          scalar_t (*ACT_FN)(const scalar_t&, const scalar_t&, const float,
                             const float)>
 __global__ void swigluoai_and_mul_kernel(
    scalar_t* __restrict__ out,          // [..., d]
-    const scalar_t* __restrict__ input,  // [..., 2, d]
+    const scalar_t* __restrict__ input,  // [..., 2 * d] (interleaved)
    const int d, const float alpha, const float limit) {
+  // For interleaved data: input has 2*d elements per token (gate/up pairs);
+  // output has d elements per token.
+  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
+  constexpr int PAIRS = VEC_SIZE / 2;  // Number of gate/up pairs per int4 load
  const int64_t token_idx = blockIdx.x;
-  // TODO: Vectorize loads and stores.
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    // gate = x[..., ::2]  (even indices)
-    const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]);
-    // up = x[..., 1::2]   (odd indices)
-    const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]);
+  const scalar_t* in_ptr = input + token_idx * 2 * d;
+  scalar_t* out_ptr = out + token_idx * d;

-    out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit);
+  // Check alignment for 128-bit vectorized access on input.
+  // For output we use int2 (64-bit) which has an 8-byte alignment requirement.
+  const bool in_aligned = is_16byte_aligned(in_ptr);
+  const bool out_aligned =
+      (reinterpret_cast<uintptr_t>(out_ptr) & 7) == 0;  // 8-byte for int2
+
+  if (in_aligned && out_aligned && d >= PAIRS) {
+    // Fast path: vectorized loop
+    // Each int4 load gives VEC_SIZE elements = PAIRS gate/up pairs
+    // Each int2 store writes PAIRS output elements
+    const int4* in_vec = reinterpret_cast<const int4*>(in_ptr);
+    int2* out_vec = reinterpret_cast<int2*>(out_ptr);
+    const int num_vecs = d / PAIRS;
+    const int vec_end = num_vecs * PAIRS;
+
+    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+      int4 v = VLLM_LDG(&in_vec[i]);
+      int2 r;
+      auto* vp = reinterpret_cast<scalar_t*>(&v);
+      auto* rp = reinterpret_cast<scalar_t*>(&r);
+#pragma unroll
+      for (int j = 0; j < PAIRS; j++) {
+        rp[j] = ACT_FN(vp[2 * j], vp[2 * j + 1], alpha, limit);
+      }
+      out_vec[i] = r;
+    }
+    // Scalar cleanup for remaining elements
+    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+      out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[2 * i]),
+                          VLLM_LDG(&in_ptr[2 * i + 1]), alpha, limit);
+    }
+  } else {
+    // Scalar fallback for unaligned data or small d
+    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      // gate = x[..., ::2]  (even indices)
+      const scalar_t gate = VLLM_LDG(&in_ptr[2 * idx]);
+      // up = x[..., 1::2]   (odd indices)
+      const scalar_t up = VLLM_LDG(&in_ptr[2 * idx + 1]);
+      out_ptr[idx] = ACT_FN(gate, up, alpha, limit);
+    }
  }
 }

@ -217,10 +324,41 @@ __global__ void activation_kernel(
    scalar_t* __restrict__ out,          // [..., d]
    const scalar_t* __restrict__ input,  // [..., d]
    const int d) {
+  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
  const int64_t token_idx = blockIdx.x;
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
-    out[token_idx * d + idx] = ACT_FN(x);
+  const scalar_t* in_ptr = input + token_idx * d;
+  scalar_t* out_ptr = out + token_idx * d;
+
+  // Check alignment for 128-bit vectorized access
+  const bool aligned = is_16byte_aligned(in_ptr) && is_16byte_aligned(out_ptr);
+
+  if (aligned && d >= VEC_SIZE) {
+    // Fast path: 128-bit vectorized loop
+    const int4* in_vec = reinterpret_cast<const int4*>(in_ptr);
+    int4* out_vec = reinterpret_cast<int4*>(out_ptr);
+    const int num_vecs = d / VEC_SIZE;
+    const int vec_end = num_vecs * VEC_SIZE;
+
+    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+      int4 v = VLLM_LDG(&in_vec[i]), r;
+      auto* vp = reinterpret_cast<scalar_t*>(&v);
+      auto* rp = reinterpret_cast<scalar_t*>(&r);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; j++) {
+        rp[j] = ACT_FN(vp[j]);
+      }
+      out_vec[i] = r;
+    }
+    // Scalar cleanup for remaining elements
+    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+      out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[i]));
+    }
+  } else {
+    // Scalar fallback for unaligned data or small d
+    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      const scalar_t x = VLLM_LDG(&in_ptr[idx]);
+      out_ptr[idx] = ACT_FN(x);
+    }
  }
 }

--- a/csrc/attention/merge_attn_states.cu
+++ b/csrc/attention/merge_attn_states.cu
@ -16,7 +16,8 @@ __global__ void merge_attn_states_kernel(
    scalar_t* output, float* output_lse, const scalar_t* prefix_output,
    const float* prefix_lse, const scalar_t* suffix_output,
    const float* suffix_lse, const uint num_tokens, const uint num_heads,
-    const uint head_size) {
+    const uint head_size, const uint prefix_head_stride,
+    const uint output_head_stride) {
  using pack_128b_t = uint4;
  const uint pack_size = 16 / sizeof(scalar_t);
  const uint threads_per_head = head_size / pack_size;
@ -34,11 +35,13 @@ __global__ void merge_attn_states_kernel(
  const uint head_idx = token_head_idx % num_heads;

  const uint pack_offset = pack_idx * pack_size;  // (0~15)*8, etc.
-  const uint head_offset =
-      token_idx * num_heads * head_size + head_idx * head_size;
-  const scalar_t* prefix_head_ptr = prefix_output + head_offset;
-  const scalar_t* suffix_head_ptr = suffix_output + head_offset;
-  scalar_t* output_head_ptr = output + head_offset;
+  const uint src_head_offset = token_idx * num_heads * prefix_head_stride +
+                               head_idx * prefix_head_stride;
+  const uint dst_head_offset = token_idx * num_heads * output_head_stride +
+                               head_idx * output_head_stride;
+  const scalar_t* prefix_head_ptr = prefix_output + src_head_offset;
+  const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
+  scalar_t* output_head_ptr = output + dst_head_offset;

  float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
  float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
@ -140,7 +143,7 @@ __global__ void merge_attn_states_kernel(
            reinterpret_cast<float*>(prefix_lse.data_ptr()),                \
            reinterpret_cast<scalar_t*>(suffix_output.data_ptr()),          \
            reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens,    \
-            num_heads, head_size);                                          \
+            num_heads, head_size, prefix_head_stride, output_head_stride);  \
  }

 /*@brief Merges the attention states from prefix and suffix
@ -166,17 +169,11 @@ void merge_attn_states_launcher(torch::Tensor& output,
  const uint num_tokens = output.size(0);
  const uint num_heads = output.size(1);
  const uint head_size = output.size(2);
+  const uint prefix_head_stride = prefix_output.stride(1);
+  const uint output_head_stride = output.stride(1);
  const uint pack_size = 16 / sizeof(scalar_t);
  TORCH_CHECK(head_size % pack_size == 0,
              "headsize must be multiple of pack_size:", pack_size);
-  TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
-              "output heads must be contiguous in memory");
-  TORCH_CHECK(
-      prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
-      "prefix_output heads must be contiguous in memory");
-  TORCH_CHECK(
-      suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
-      "suffix_output heads must be contiguous in memory");
  float* output_lse_ptr = nullptr;
  if (output_lse.has_value()) {
    output_lse_ptr = output_lse.value().data_ptr<float>();
--- a/csrc/cache.h
+++ b/csrc/cache.h
@ -1,6 +1,7 @@
 #pragma once

 #include <torch/all.h>
+#include <c10/util/Optional.h>

 #include <map>
 #include <vector>
@ -8,16 +9,6 @@
 void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                 const torch::Tensor& block_mapping);

-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
-                 const torch::Tensor& block_mapping);
-
-void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
-                     const torch::Tensor& block_mapping);
-
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                       torch::Tensor& key_cache, torch::Tensor& value_cache,
                       torch::Tensor& slot_mapping,
@ -41,11 +32,12 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                 const double scale, const std::string& kv_cache_dtype);

 void gather_and_maybe_dequant_cache(
-    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
-    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
-    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
-    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
-    int64_t batch_size, const std::string& kv_cache_dtype,
+    torch::Tensor const& src_cache,     // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
+    torch::Tensor const& dst,           // [TOT_TOKENS, ENTRIES...]
+    torch::Tensor const& block_table,   // [BATCH, BLOCK_INDICES]
+    torch::Tensor const& cu_seq_lens,   // [BATCH+1]
+    torch::Tensor const& token_to_seq,  // [MAX_TOKEN_ACROSS_CHUNKS]
+    int64_t num_tokens, const std::string& kv_cache_dtype,
    torch::Tensor const& scale,
    std::optional<torch::Tensor> seq_starts = std::nullopt);

@ -57,6 +49,15 @@ void cp_gather_cache(
    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
    int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);

+// Gather and upconvert FP8 KV cache to BF16 workspace
+void cp_gather_and_upconvert_fp8_kv_cache(
+    torch::Tensor const& src_cache,         // [NUM_BLOCKS, BLOCK_SIZE, 656]
+    torch::Tensor const& dst,               // [TOT_TOKENS, 576]
+    torch::Tensor const& block_table,       // [BATCH, BLOCK_INDICES]
+    torch::Tensor const& seq_lens,          // [BATCH]
+    torch::Tensor const& workspace_starts,  // [BATCH]
+    int64_t batch_size);
+
 // Indexer K quantization and cache function
 void indexer_k_quant_and_cache(
    torch::Tensor& k,             // [num_tokens, head_dim]
@ -71,4 +72,4 @@ void cp_gather_indexer_k_quant_cache(
    torch::Tensor& dst_k,           // [num_tokens, head_dim]
    torch::Tensor& dst_scale,  // [num_tokens, head_dim / quant_block_size * 4]
    const torch::Tensor& block_table,   // [batch_size, num_blocks]
-    const torch::Tensor& cu_seq_lens);  // [batch_size + 1]
+    const torch::Tensor& cu_seq_lens);  // [batch_size + 1]
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@ -2,6 +2,7 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAException.h>
+#include <c10/util/Optional.h>

 #include "cuda_utils.h"
 #include "cuda_compat.h"
@ -118,94 +119,6 @@ __global__ void copy_blocks_mla_kernel(

 }  // namespace vllm

-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
-                 const torch::Tensor& block_mapping) {
-  int num_layers = key_caches.size();
-  TORCH_CHECK(num_layers == value_caches.size());
-  if (num_layers == 0) {
-    return;
-  }
-  torch::Device cache_device = key_caches[0].device();
-  TORCH_CHECK(cache_device.is_cuda());
-
-  // Create data structures for the kernel.
-  // Create an array of pointers to the key and value caches.
-  int64_t key_cache_ptrs[num_layers];
-  int64_t value_cache_ptrs[num_layers];
-  for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
-    key_cache_ptrs[layer_idx] =
-        reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
-    value_cache_ptrs[layer_idx] =
-        reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
-  }
-
-  // block_mapping is a 2D tensor with shape (num_pairs, 2).
-  int num_pairs = block_mapping.size(0);
-
-  // Move the data structures to the GPU.
-  // NOTE: This synchronizes the CPU and GPU.
-  torch::Tensor key_cache_ptrs_tensor =
-      torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64)
-          .to(cache_device);
-  torch::Tensor value_cache_ptrs_tensor =
-      torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64)
-          .to(cache_device);
-
-  // Launch the kernel.
-  const int numel_per_block = key_caches[0][0].numel();
-  dim3 grid(num_layers, num_pairs);
-  dim3 block(std::min(1024, numel_per_block));
-  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
-      key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
-        vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
-            key_cache_ptrs_tensor.data_ptr<int64_t>(),
-            value_cache_ptrs_tensor.data_ptr<int64_t>(),
-            block_mapping.data_ptr<int64_t>(), numel_per_block);
-      }));
-}
-
-// copy blocks kernel for MLA (assumes a joint KV-cache)
-void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
-                     const torch::Tensor& block_mapping) {
-  int num_layers = kv_caches.size();
-  if (num_layers == 0) {
-    return;
-  }
-  torch::Device cache_device = kv_caches[0].device();
-  TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA");
-
-  std::vector<int64_t> cache_ptrs(num_layers);
-  for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
-    cache_ptrs[layer_idx] =
-        reinterpret_cast<int64_t>(kv_caches[layer_idx].data_ptr());
-  }
-  torch::Tensor cache_ptrs_tensor =
-      torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64)
-          .to(cache_device);
-
-  int num_pairs = block_mapping.size(0);
-  // We use the stride instead of numel in case the cache is padded for memory
-  // alignment reasons, we assume the blocks data (inclusive of any padding)
-  // is contiguous in memory
-  int mem_footprint_per_block = kv_caches[0].stride(0);
-  dim3 grid(num_layers, num_pairs);
-  dim3 block(std::min(1024, mem_footprint_per_block));
-  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
-      kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] {
-        vllm::copy_blocks_mla_kernel<scalar_t><<<grid, block, 0, stream>>>(
-            cache_ptrs_tensor.data_ptr<int64_t>(),
-            block_mapping.data_ptr<int64_t>(), mem_footprint_per_block);
-      }));
-}
-
 namespace vllm {

 // Used to copy/convert one element
@ -514,7 +427,8 @@ __global__ void indexer_k_quant_and_cache_kernel(
    const int quant_block_size,                // quantization block size
    const int cache_block_size,                // cache block size
    const int cache_stride,  // stride for each token in kv_cache
-    const bool use_ue8m0     // use ue8m0 scale format
+
+    const bool use_ue8m0  // use ue8m0 scale format
 ) {
  constexpr int VEC_SIZE = 4;
  const int64_t token_idx = blockIdx.x;
@ -552,7 +466,11 @@ __global__ void indexer_k_quant_and_cache_kernel(
 #ifndef USE_ROCM
  __syncwarp();
 #endif
+#if defined(__gfx942__)
+  float scale = fmaxf(amax, 1e-4) / 224.0f;
+#else
  float scale = fmaxf(amax, 1e-4) / 448.0f;
+#endif
  if (use_ue8m0) {
    scale = exp2f(ceilf(log2f(scale)));
  }
@ -901,87 +819,80 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
 namespace vllm {

 // grid is launched with dimensions (batch, num_splits)
-template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
+template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt,
+          int ENTRY_SIZE, int CTA_SIZE>
 __global__ void gather_and_maybe_dequant_cache(
-    const cache_t* __restrict__ src_cache,    // [NUM_BLOCKS, BLOCK_SIZE,
-                                              // ENTRIES...]
-    scalar_t* __restrict__ dst,               // [TOT_TOKENS, ENTRIES...]
-    const int32_t* __restrict__ block_table,  // [BATCH, BLOCK_INDICES]
-    const int32_t* __restrict__ cu_seq_lens,  // [BATCH+1]
-    const int32_t block_size, const int32_t entry_size,
+    const cache_t* __restrict__ src_cache,     // [NUM_BLOCKS, BLOCK_SIZE,
+                                               // ENTRIES...]
+    scalar_t* __restrict__ dst,                // [TOT_TOKENS, ENTRIES...]
+    const int32_t* __restrict__ block_table,   // [BATCH, BLOCK_INDICES]
+    const int32_t* __restrict__ cu_seq_lens,   // [BATCH+1]
+    const int32_t* __restrict__ token_to_seq,  // [MAX_TOKEN_ACROSS_CHUNKS]
+    const int32_t num_tokens, const int32_t block_size,
    const int64_t block_table_stride, const int64_t cache_block_stride,
    const int64_t cache_entry_stride, const int64_t dst_entry_stride,
    const float* __restrict__ scale,
    const int32_t* __restrict__ seq_starts) {  // Optional: starting offsets per
                                               // batch
+  constexpr int vec_size = sizeof(float4) / sizeof(scalar_t);
+  using ltype = vllm::vec_n_t<cache_t, vec_size>;
+  using stype = vllm::vec_n_t<scalar_t, vec_size>;
+  // Documents the launch contract (blockDim.x == CTA_SIZE); the assert is
+  // compiled out in release builds.
+  assert(CTA_SIZE == blockDim.x);

-  const int64_t bid = blockIdx.x;  // Batch ID
-  const int32_t num_splits = gridDim.y;
-  const int32_t split = blockIdx.y;
-  const int32_t seq_start = cu_seq_lens[bid];
-  const int32_t seq_end = cu_seq_lens[bid + 1];
-  const int32_t seq_len = seq_end - seq_start;
-  const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
-  const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
+#pragma unroll
+  for (int token_id = blockIdx.x; token_id < num_tokens;
+       token_id += gridDim.x) {
+    int64_t batch_id = token_to_seq[token_id];
+    int64_t batch_start = cu_seq_lens[batch_id];
+    int64_t batch_end = cu_seq_lens[batch_id + 1];
+    int32_t batch_offset = token_id - batch_start;

-  const int32_t split_start = split * split_blocks;
-  const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
+    if (token_id >= batch_end) return;
+    int32_t offset = 0;
+    if (seq_starts != nullptr) {
+      offset = seq_starts[batch_id];
+    }
+    batch_offset += offset;
+    int32_t block_table_id = batch_offset / block_size;
+    int32_t slot_id = batch_offset % block_size;
+    int32_t block_table_offset = batch_id * block_table_stride + block_table_id;
+    int32_t block_id = block_table[block_table_offset];
+    int64_t cache_offset =
+        block_id * cache_block_stride + slot_id * cache_entry_stride;
+    constexpr int32_t vec_iter_cnt = ENTRY_SIZE / vec_size;
+    scalar_t* dst_ = dst + token_id * dst_entry_stride;
+    cache_t* src_ = const_cast<cache_t*>(src_cache) + cache_offset;

-  const bool is_active_split = (split_start < tot_blocks);
-  const bool is_last_split = (split_end == tot_blocks);
-
-  if (!is_active_split) return;
-
-  int32_t full_blocks_end = split_end;
-  int32_t partial_block_size = 0;
-
-  // Adjust the pointer for the block_table for this batch.
-  // If seq_starts is provided, compute an offset based on (seq_starts[bid] /
-  // page_size)
-  const int32_t batch_offset = bid * block_table_stride;
-  int32_t offset = 0;
-  if (seq_starts != nullptr) {
-    offset = seq_starts[bid] / block_size;
-  }
-  const int32_t* batch_block_table = block_table + batch_offset + offset;
-
-  // Adjust dst pointer based on the cumulative sequence lengths.
-  dst += seq_start * dst_entry_stride;
-
-  if (is_last_split) {
-    partial_block_size = seq_len % block_size;
-    if (partial_block_size) full_blocks_end -= 1;
-  }
-
-  auto copy_entry = [&](const cache_t* __restrict__ _src,
-                        scalar_t* __restrict__ _dst) {
-    for (int i = threadIdx.x; i < entry_size; i += blockDim.x) {
+#pragma unroll
+    for (int idx = threadIdx.x; idx < vec_iter_cnt; idx += CTA_SIZE) {
      if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
-        _dst[i] = static_cast<scalar_t>(_src[i]);
+        reinterpret_cast<stype*>(dst_)[idx] =
+            static_cast<stype>(reinterpret_cast<ltype*>(src_)[idx]);
      } else {
-        _dst[i] =
-            fp8::scaled_convert<scalar_t, cache_t, kv_dt>(_src[i], *scale);
+        ltype loaded_val = reinterpret_cast<ltype*>(src_)[idx];
+        stype store_val;
+#pragma unroll
+        for (int j = 0; j < vec_size; ++j) {
+          store_val.val[j] = fp8::scaled_convert<scalar_t, cache_t, kv_dt>(
+              loaded_val.val[j], *scale);
+        }
+        reinterpret_cast<stype*>(dst_)[idx] = store_val;
      }
    }
-  };
-
-  for (int pid = split_start; pid < full_blocks_end; ++pid) {
-    auto block_id = batch_block_table[pid];
-    auto block_start_ptr = src_cache + block_id * cache_block_stride;
-    auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
-    for (int eid = 0; eid < block_size; ++eid) {
-      copy_entry(block_start_ptr + eid * cache_entry_stride,
-                 block_dst_ptr + eid * dst_entry_stride);
-    }
-  }
-
-  if (partial_block_size) {
-    auto block_id = batch_block_table[full_blocks_end];
-    auto block_start_ptr = src_cache + block_id * cache_block_stride;
-    auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride;
-    for (int eid = 0; eid < partial_block_size; ++eid) {
-      copy_entry(block_start_ptr + eid * cache_entry_stride,
-                 block_dst_ptr + eid * dst_entry_stride);
+    // process tail
+    constexpr int32_t tail_cnt = ENTRY_SIZE % vec_size;
+    dst_ = dst_ + ENTRY_SIZE - tail_cnt;
+    src_ = src_ + ENTRY_SIZE - tail_cnt;
+#pragma unroll
+    for (int idx = threadIdx.x; idx < tail_cnt; idx += CTA_SIZE) {
+      if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+        dst_[idx] = static_cast<scalar_t>(src_[idx]);
+      } else {
+        dst_[idx] =
+            fp8::scaled_convert<scalar_t, cache_t, kv_dt>(src_[idx], *scale);
+      }
    }
  }
 }
@ -992,34 +903,38 @@ __global__ void gather_and_maybe_dequant_cache(
 // SCALAR_T is the data type of the destination tensor.
 // CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
-#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE)                      \
-  vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE>         \
-      <<<grid, block, 0, stream>>>(                                         \
-          reinterpret_cast<CACHE_T*>(src_cache.data_ptr()),                 \
-          reinterpret_cast<SCALAR_T*>(dst.data_ptr()),                      \
-          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
-          block_size, entry_size, block_table_stride, cache_block_stride,   \
-          cache_entry_stride, dst_entry_stride,                             \
-          reinterpret_cast<const float*>(scale.data_ptr()), seq_starts_ptr);
+#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE)                        \
+  vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576,      \
+                                       thread_block_size>                     \
+      <<<grid, block, 0, stream>>>(                                           \
+          reinterpret_cast<CACHE_T*>(src_cache.data_ptr()),                   \
+          reinterpret_cast<SCALAR_T*>(dst.data_ptr()),                        \
+          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(),   \
+          token_to_seq.data_ptr<int32_t>(), num_tokens, block_size,           \
+          block_table_stride, cache_block_stride, cache_entry_stride,         \
+          dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
+          seq_starts_ptr);

 // Gather sequences from the cache into the destination tensor.
 //  - cu_seq_lens contains the cumulative sequence lengths for each batch
 //  - block_table contains the cache block indices for each sequence
+//  - token_to_seq contains the back mapping from token_id to batch_id
 //  - Optionally, seq_starts (if provided) offsets the starting block index by
 //  (seq_starts[bid] / page_size)
 void gather_and_maybe_dequant_cache(
-    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
-    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
-    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
-    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
-    int64_t batch_size, const std::string& kv_cache_dtype,
+    torch::Tensor const& src_cache,     // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
+    torch::Tensor const& dst,           // [TOT_TOKENS, ENTRIES...]
+    torch::Tensor const& block_table,   // [BATCH, BLOCK_INDICES]
+    torch::Tensor const& cu_seq_lens,   // [BATCH+1]
+    torch::Tensor const& token_to_seq,  // [MAX_TOKEN_ACROSS_CHUNKS]
+    int64_t num_tokens, const std::string& kv_cache_dtype,
    torch::Tensor const& scale,
    std::optional<torch::Tensor> seq_starts = std::nullopt) {
  at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  int32_t block_size = src_cache.size(1);
-  int32_t entry_size = src_cache.flatten(2, -1).size(2);
+  int32_t head_dim = dst.size(-1);

  TORCH_CHECK(block_table.dtype() == torch::kInt32,
              "block_table must be int32");
@ -1029,6 +944,9 @@ void gather_and_maybe_dequant_cache(
    TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
                "seq_starts must be int32");
  }
+  TORCH_CHECK(head_dim == 576,
+              "gather_and_maybe_dequant_cache only support the head_dim to 576 "
+              "for better performance")

  TORCH_CHECK(src_cache.device() == dst.device(),
              "src_cache and dst must be on the same device");
@ -1046,10 +964,9 @@ void gather_and_maybe_dequant_cache(
  int64_t cache_entry_stride = src_cache.stride(1);
  int64_t dst_entry_stride = dst.stride(0);

-  // Decide on the number of splits based on the batch size.
-  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
-  dim3 grid(batch_size, num_splits);
-  dim3 block(1024);
+  constexpr int32_t thread_block_size = 64;
+  dim3 grid(num_tokens);
+  dim3 block(thread_block_size);

  const int32_t* seq_starts_ptr =
      seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
@ -1058,6 +975,82 @@ void gather_and_maybe_dequant_cache(
 }

 namespace vllm {
+
+// Gather FP8 KV cache tokens and upconvert them to a BF16 workspace.
+// Similar to cp_gather_cache but specifically for FP8->BF16 conversion.
+//
+// Launch layout, as consumed by the kernel body:
+//   blockIdx.x  - batch (sequence) index
+//   blockIdx.y  - split index; the tokens of one sequence are divided evenly
+//                 across gridDim.y splits
+//   threadIdx.x - output element within a token: threads 0..511 dequantize
+//                 one FP8 value each, threads 512..575 copy one rope element
+//                 each. There is no loop over threadIdx.x, so a block needs
+//                 at least 576 threads to write a complete token.
+//
+// Per-token source layout (bytes): 512 FP8 values, 4 float scales (one per
+// tile of 128 values), then 64 BF16 rope values.
+//
+// Parameters:
+//   src_cache           paged FP8 cache; addressed through cache_block_stride
+//                       and cache_entry_stride (uint8_t elements)
+//   dst                 BF16 output; rows addressed through dst_entry_stride
+//   block_table         per-sequence list of cache block ids, one row per
+//                       sequence, rows block_table_stride apart
+//   seq_lens            number of tokens to gather for each sequence
+//   workspace_starts    first dst row of each sequence
+//   block_size          number of tokens per cache block
+//   head_dim            not read by the kernel body; the 512 + 64 layout is
+//                       hard-coded
+__global__ void cp_gather_and_upconvert_fp8_kv_cache(
+    const uint8_t* __restrict__ src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, 656]
+    __nv_bfloat16* __restrict__ dst,          // [TOT_TOKENS, 576]
+    const int32_t* __restrict__ block_table,  // [BATCH, BLOCK_INDICES]
+    const int32_t* __restrict__ seq_lens,     // [BATCH]
+    const int32_t* __restrict__ workspace_starts,  // [BATCH]
+    const int32_t block_size, const int32_t head_dim,
+    const int64_t block_table_stride, const int64_t cache_block_stride,
+    const int64_t cache_entry_stride, const int64_t dst_entry_stride) {
+  const int64_t bid = blockIdx.x;  // Batch ID
+  const int32_t num_splits = gridDim.y;
+  const int32_t split = blockIdx.y;
+  const int32_t seq_start = workspace_starts[bid];
+  const int32_t seq_len = seq_lens[bid];
+  const int32_t tot_slots = seq_len;
+  const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
+
+  // Token range [split_start, split_end) handled by this split
+  const int32_t split_start = split * split_slots;
+  const int32_t split_end = min((split + 1) * split_slots, tot_slots);
+
+  // Trailing splits may have no tokens when the sequence is short
+  const bool is_active_split = (split_start < tot_slots);
+
+  if (!is_active_split) return;
+
+  // Adjust the pointer for the block_table for this batch.
+  // Kept in 64 bits: both operands are int64_t, so narrowing the product to
+  // int32_t would silently truncate it for large tables.
+  const int64_t batch_offset = bid * block_table_stride;
+  int32_t offset = split_start;
+  int32_t offset_div = offset / block_size;  // index into the block table
+  offset = offset % block_size;              // token slot inside that block
+  const int32_t* batch_block_table = block_table + batch_offset;
+
+  // Adjust dst pointer to the first output row of this sequence
+  dst += seq_start * dst_entry_stride;
+
+  const int tid = threadIdx.x;
+
+  // Process each token in this split
+  for (int pid = split_start; pid < split_end; ++pid) {
+    auto block_id = batch_block_table[offset_div];
+    const uint8_t* token_ptr =
+        src_cache + block_id * cache_block_stride + offset * cache_entry_stride;
+    __nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride;
+
+    // FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16)
+    const uint8_t* no_pe_ptr = token_ptr;
+    const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
+    const __nv_bfloat16* rope_ptr =
+        reinterpret_cast<const __nv_bfloat16*>(token_ptr + 512 + 16);
+
+    // Parallelize fp8 dequant (512 elements) and rope copy (64 elements)
+    if (tid < 512) {
+      // FP8 dequantization
+      const int tile = tid >> 7;  // each tile is 128 elements
+      const float scale = scales_ptr[tile];
+      const uint8_t val = no_pe_ptr[tid];
+      dst_ptr[tid] =
+          fp8::scaled_convert<__nv_bfloat16, uint8_t,
+                              vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale);
+    } else if (tid < 576) {
+      // Rope copy (64 bf16 elements)
+      const int rope_idx = tid - 512;
+      dst_ptr[512 + rope_idx] = rope_ptr[rope_idx];
+    }
+
+    // Move to next token, rolling over to the next cache block when needed
+    offset += 1;
+    if (offset == block_size) {
+      offset_div += 1;
+      offset = 0;
+    }
+  }
+}
+
 template <typename scalar_t>
 // Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by
 // block_size.
@ -1199,6 +1192,57 @@ void cp_gather_cache(
  }
 }

+// Host entry point: validates the tensors and launches
+// vllm::cp_gather_and_upconvert_fp8_kv_cache on the current CUDA stream.
+//
+// Requirements enforced below:
+//   - block_table, seq_lens and workspace_starts are int32
+//   - all tensors live on the same device as src_cache
+//   - src_cache is uint8, dst is bfloat16 with dst.size(1) == 576
+//
+// The grid is (batch_size, num_splits) with 576 threads per block, i.e. one
+// thread per output element of a token. An empty batch is a no-op.
+void cp_gather_and_upconvert_fp8_kv_cache(
+    torch::Tensor const& src_cache,         // [NUM_BLOCKS, BLOCK_SIZE, 656]
+    torch::Tensor const& dst,               // [TOT_TOKENS, 576]
+    torch::Tensor const& block_table,       // [BATCH, BLOCK_INDICES]
+    torch::Tensor const& seq_lens,          // [BATCH]
+    torch::Tensor const& workspace_starts,  // [BATCH]
+    int64_t batch_size) {
+  at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  int32_t block_size = src_cache.size(1);
+  int32_t head_dim = dst.size(1);
+
+  TORCH_CHECK(block_table.dtype() == torch::kInt32,
+              "block_table must be int32");
+  TORCH_CHECK(seq_lens.dtype() == torch::kInt32, "seq_lens must be int32");
+  TORCH_CHECK(workspace_starts.dtype() == torch::kInt32,
+              "workspace_starts must be int32");
+
+  TORCH_CHECK(src_cache.device() == dst.device(),
+              "src_cache and dst must be on the same device");
+  TORCH_CHECK(src_cache.device() == block_table.device(),
+              "src_cache and block_table must be on the same device");
+  TORCH_CHECK(src_cache.device() == seq_lens.device(),
+              "src_cache and seq_lens must be on the same device");
+  TORCH_CHECK(src_cache.device() == workspace_starts.device(),
+              "src_cache and workspace_starts must be on the same device");
+
+  TORCH_CHECK(src_cache.dtype() == torch::kUInt8, "src_cache must be uint8");
+  TORCH_CHECK(dst.dtype() == torch::kBFloat16, "dst must be bfloat16");
+  TORCH_CHECK(head_dim == 576, "head_dim must be 576 for MLA");
+
+  // Nothing to gather. A grid with a zero-sized dimension is an invalid
+  // launch configuration, so skip the launch instead of issuing it.
+  if (batch_size == 0) {
+    return;
+  }
+
+  int64_t block_table_stride = block_table.stride(0);
+  int64_t cache_block_stride = src_cache.stride(0);
+  int64_t cache_entry_stride = src_cache.stride(1);
+  int64_t dst_entry_stride = dst.stride(0);
+
+  // Decide on the number of splits based on the batch size: fewer splits per
+  // sequence as the batch grows
+  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
+  dim3 grid(batch_size, num_splits);
+  // One thread per output element: 512 dequantized values + 64 rope values
+  dim3 block(576);
+
+  vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
+      src_cache.data_ptr<uint8_t>(),
+      reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
+      block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
+      workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
+      block_table_stride, cache_block_stride, cache_entry_stride,
+      dst_entry_stride);
+}
+
 // Macro to dispatch the kernel based on the data type.
 #define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE)         \
  vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE>       \
--- a/csrc/cpu/cpu_arch_macros.h
+++ b/csrc/cpu/cpu_arch_macros.h
@ -1,5 +1,5 @@
-#ifndef CPU_ATTN_MACROS_H
-#define CPU_ATTN_MACROS_H
+#ifndef CPU_ARCH_MACROS_H
+#define CPU_ARCH_MACROS_H

 // x86_64
 #ifdef __x86_64__
@ -26,7 +26,7 @@
          _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));                  \
      const __m512i vec_127 = _mm512_set1_epi32(0x0000007f);                   \
      const int n_mantissa_bits = 23;                                          \
-      auto fast_exp = [&](vec_op::FP32Vec16& vec) __attribute__((              \
+      auto fast_exp = [&](const vec_op::FP32Vec16& vec) __attribute__((        \
                          always_inline)) {                                    \
        __m512 values = vec.reg;                                               \
        auto less_ln_flt_min_mask =                                            \
@ -60,4 +60,54 @@

 #endif

-#endif
+#ifdef __aarch64__
+  // Implementation copied from Arm Optimized Routines (expf AdvSIMD)
+  // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
+  //
+  // DEFINE_FAST_EXP expands, in the enclosing scope, to a set of constants
+  // and two lambdas:
+  //   neon_expf(float32x4_t) - 4-lane exp: n = round(x * inv_ln2),
+  //     r = x - n * ln2_hi - n * ln2_lo, scale = 2^n built by shifting n into
+  //     the exponent field and adding the bias 0x3f800000, then a polynomial
+  //     in r is evaluated and the result is scale + poly * scale. Lanes with
+  //     x >= pos_special_bound return +inf and lanes with
+  //     x <= neg_special_bound return 0.
+  //   fast_exp(const vec_op::FP32Vec16&) - applies neon_expf to each of the
+  //     four float32x4_t parts of vec.reg and returns a new FP32Vec16.
+  // Both lambdas capture by reference, so the macro must be expanded in a
+  // scope that outlives their use.
+  #include <limits>
+  #define DEFINE_FAST_EXP                                                      \
+    const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f);                   \
+    const float ln2_hi = 0x1.62e4p-1f;                                         \
+    const float ln2_lo = 0x1.7f7d1cp-20f;                                      \
+    const float c0 = 0x1.0e4020p-7f;                                           \
+    const float c2 = 0x1.555e66p-3f;                                           \
+    const float32x4_t ln2_c02 = {ln2_hi, ln2_lo, c0, c2};                      \
+    const uint32x4_t exponent_bias = vdupq_n_u32(0x3f800000);                  \
+    const float32x4_t c1 = vdupq_n_f32(0x1.573e2ep-5f);                        \
+    const float32x4_t c3 = vdupq_n_f32(0x1.fffdb6p-2f);                        \
+    const float32x4_t c4 = vdupq_n_f32(0x1.ffffecp-1f);                        \
+    const float32x4_t pos_special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);         \
+    const float32x4_t neg_special_bound = vnegq_f32(pos_special_bound);        \
+    const float32x4_t inf =                                                    \
+        vdupq_n_f32(std::numeric_limits<float>::infinity());                   \
+    const float32x4_t zero = vdupq_n_f32(0.0f);                                \
+    auto neon_expf = [&](float32x4_t values) __attribute__((always_inline)) {  \
+      float32x4_t n = vrndaq_f32(vmulq_f32(values, inv_ln2));                  \
+      float32x4_t r = vfmsq_laneq_f32(values, n, ln2_c02, 0);                  \
+      r = vfmsq_laneq_f32(r, n, ln2_c02, 1);                                   \
+      uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(n)), 23); \
+      float32x4_t scale = vreinterpretq_f32_u32(vaddq_u32(e, exponent_bias));  \
+      float32x4_t r2 = vmulq_f32(r, r);                                        \
+      float32x4_t p = vfmaq_laneq_f32(c1, r, ln2_c02, 2);                      \
+      float32x4_t q = vfmaq_laneq_f32(c3, r, ln2_c02, 3);                      \
+      q = vfmaq_f32(q, p, r2);                                                 \
+      p = vmulq_f32(c4, r);                                                    \
+      float32x4_t poly = vfmaq_f32(p, q, r2);                                  \
+      poly = vfmaq_f32(scale, poly, scale);                                    \
+      const uint32x4_t hi_mask = vcgeq_f32(values, pos_special_bound);         \
+      const uint32x4_t lo_mask = vcleq_f32(values, neg_special_bound);         \
+      poly = vbslq_f32(hi_mask, inf, poly);                                    \
+      return vbslq_f32(lo_mask, zero, poly);                                   \
+    };                                                                         \
+    auto fast_exp = [&](const vec_op::FP32Vec16& vec)                          \
+                        __attribute__((always_inline)) {                       \
+                          float32x4x4_t result;                                \
+                          result.val[0] = neon_expf(vec.reg.val[0]);           \
+                          result.val[1] = neon_expf(vec.reg.val[1]);           \
+                          result.val[2] = neon_expf(vec.reg.val[2]);           \
+                          result.val[3] = neon_expf(vec.reg.val[3]);           \
+                          return vec_op::FP32Vec16(result);                    \
+                        };
+
+#endif  // __aarch64__
+
+#endif
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@ -13,6 +13,18 @@
  #define AMX_DISPATCH(...) case cpu_attention::ISA::AMX:
 #endif

+// NEON_DISPATCH(...) emits the `case cpu_attention::ISA::NEON` arm of the ISA
+// switch.
+//  - On aarch64 it binds `attn_impl` to the NEON AttentionImpl specialization
+//    (using the `scalar_t` and `head_dim` names visible at the expansion
+//    site) and returns the result of invoking the passed callable.
+//  - Elsewhere it expands to a bare case label with no body, so control
+//    falls through to whichever case follows it in the switch.
+#ifdef __aarch64__
+  #include "cpu_attn_neon.hpp"
+  #define NEON_DISPATCH(...)                                                   \
+    case cpu_attention::ISA::NEON: {                                           \
+      using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
+                                                     scalar_t, head_dim>;      \
+      return __VA_ARGS__();                                                    \
+    }
+#else
+  #define NEON_DISPATCH(...) case cpu_attention::ISA::NEON:
+#endif  // #ifdef __aarch64__
+
 #define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \
  case HEAD_DIM: {                            \
    constexpr size_t head_dim = HEAD_DIM;     \
@ -41,6 +53,7 @@
  [&] {                                                                       \
    switch (ISA_TYPE) {                                                       \
      AMX_DISPATCH(__VA_ARGS__)                                               \
+      NEON_DISPATCH(__VA_ARGS__)                                              \
      case cpu_attention::ISA::VEC: {                                         \
        using attn_impl =                                                     \
            cpu_attention::AttentionImpl<cpu_attention::ISA::VEC, scalar_t,   \
@ -73,6 +86,8 @@ torch::Tensor get_scheduler_metadata(
    isa = cpu_attention::ISA::VEC;
  } else if (isa_hint == "vec16") {
    isa = cpu_attention::ISA::VEC16;
+  } else if (isa_hint == "neon") {
+    isa = cpu_attention::ISA::NEON;
  } else {
    TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint);
  }
@ -102,7 +117,6 @@ torch::Tensor get_scheduler_metadata(
  input.casual = casual;
  input.isa = isa;
  input.enable_kv_split = enable_kv_split;
-  TORCH_CHECK(casual, "Only supports casual mask for now.");

  VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
    CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
@ -158,6 +172,8 @@ void cpu_attn_reshape_and_cache(
      return cpu_attention::ISA::VEC;
    } else if (isa == "vec16") {
      return cpu_attention::ISA::VEC16;
+    } else if (isa == "neon") {
+      return cpu_attention::ISA::NEON;
    } else {
      TORCH_CHECK(false, "Invalid ISA type: " + isa);
    }
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@ -8,13 +8,11 @@
  #include <sys/sysctl.h>
 #endif

-#include "cpu_types.hpp"
-#include "scratchpad_manager.h"
-#include "cpu_attn_macros.h"
-#include "utils.hpp"
+#include "cpu/cpu_arch_macros.h"
+#include "cpu/utils.hpp"

 namespace cpu_attention {
-enum class ISA { AMX, VEC, VEC16 };
+enum class ISA { AMX, VEC, VEC16, NEON };

 template <ISA isa, typename scalar_t, int64_t head_dim>
 class AttentionImpl {};
@ -143,6 +141,12 @@ struct AttentionMetadata {
      case ISA::VEC:
        ss << "VEC, ";
        break;
+      case ISA::VEC16:
+        ss << "VEC16, ";
+        break;
+      case ISA::NEON:
+        ss << "NEON, ";
+        break;
    }
    ss << "workitem_group_num: " << workitem_group_num
       << ", reduction_item_num: " << reduction_item_num
@ -180,7 +184,7 @@ struct AttentionMetadata {
 //  - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2
 //  * q_tile_size * 4, partial output, max + sum (float)
 // Reduction scratchpad contains:
-//  - flags: bool array to indicate wether the split is finished
+//  - flags: bool array to indicate whether the split is finished
 //  - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size
 //  - max, sum: 2 * split_num * q_tile_size * 4
 class AttentionScratchPad {
@ -372,12 +376,13 @@ class AttentionScheduler {

  static constexpr int32_t MaxQTileIterNum = 128;

-  AttentionScheduler() : available_cache_size_(get_available_l2_size()) {}
+  AttentionScheduler()
+      : available_cache_size_(cpu_utils::get_available_l2_size()) {}

  torch::Tensor schedule(const ScheduleInput& input) const {
    const bool casual = input.casual;
    const int32_t thread_num = omp_get_max_threads();
-    const int64_t cache_size = get_available_l2_size();
+    const int64_t cache_size = cpu_utils::get_available_l2_size();
    const int32_t max_num_q_per_iter = input.max_num_q_per_iter;
    const int32_t kv_len_alignment = input.kv_block_alignment;
    int32_t q_head_per_kv = input.num_heads_q / input.num_heads_kv;
@ -653,7 +658,7 @@ class AttentionScheduler {
            metadata_ptr->thread_num +
        metadata_ptr->reduction_scratchpad_size_per_kv_head *
            (use_gqa ? input.num_heads_kv : input.num_heads_q);
-    DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(
+    cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc(
        scratchpad_size);

    // metadata_ptr->print();
@ -661,7 +666,7 @@ class AttentionScheduler {
    // test out of boundary access
    // {
    //     float* cache_ptr =
-    //     DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<float>();
+    //     cpu_utils::ScratchPadManager::get_scratchpad_manager()->get_data<float>();
    //     for (int64_t i = 0; i < scratchpad_size / sizeof(float); ++i) {
    //         cache_ptr[i] = std::numeric_limits<float>::quiet_NaN();
    //     }
@ -743,27 +748,6 @@ class AttentionScheduler {
    return std::max(rounded_tile_size, round_size);
  }

-  static int64_t get_available_l2_size() {
-    static int64_t size = []() {
-#if defined(__APPLE__)
-      // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname.
-      int64_t l2_cache_size = 0;
-      size_t len = sizeof(l2_cache_size);
-      if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 &&
-          l2_cache_size > 0) {
-        return l2_cache_size >> 1;  // use 50% of L2 cache
-      }
-      // Fallback if sysctlbyname fails
-      return 128LL * 1024 >> 1;  // use 50% of 128KB
-#else
-      long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
-      TORCH_CHECK_NE(l2_cache_size, -1);
-      return l2_cache_size >> 1;  // use 50% of L2 cache
-#endif
-    }();
-    return size;
-  }
-
 private:
  int64_t available_cache_size_;
 };
@ -841,7 +825,7 @@ struct VecTypeTrait<c10::BFloat16> {
 };
 #endif

-#if !defined(__powerpc__)
+#if !defined(__powerpc__) && !defined(__s390x__)
 template <>
 struct VecTypeTrait<c10::Half> {
  using vec_t = vec_op::FP16Vec16;
@ -1240,14 +1224,8 @@ class AttentionMainLoop {
        // rescale sum and partial outputs
        if (need_rescale) {
          // compute rescale factor
-#ifdef DEFINE_FAST_EXP
-          vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
-          rescale_factor_vec = fast_exp(rescale_factor_vec);
-          rescale_factor = rescale_factor_vec.get_last_elem();
-#else
          rescale_factor = std::exp(rescale_factor);
          vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
-#endif

          // rescale sum
          new_sum_val += rescale_factor * init_sum_val;
@ -1402,7 +1380,7 @@ class AttentionMainLoop {

      // init buffers
      void* scratchpad_ptr =
-          DNNLScratchPadManager::get_dnnl_scratchpad_manager()
+          cpu_utils::ScratchPadManager::get_scratchpad_manager()
              ->get_data<void>();
      AttentionScratchPad buffer_manager(thread_id, metadata, scratchpad_ptr);

@ -1422,8 +1400,7 @@ class AttentionMainLoop {
        }
      }

-      const int64_t available_cache_size =
-          AttentionScheduler::get_available_l2_size();
+      const int64_t available_cache_size = cpu_utils::get_available_l2_size();
      const int32_t default_tile_size =
          AttentionScheduler::calcu_default_tile_size(
              available_cache_size, head_dim, sizeof(kv_cache_t),
@ -1883,15 +1860,8 @@ class AttentionMainLoop {
                                   : curr_output_buffer;
          float rescale_factor = final_max > curr_max ? curr_max - final_max
                                                      : final_max - curr_max;
-
-#ifdef DEFINE_FAST_EXP
-          vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
-          rescale_factor_vec = fast_exp(rescale_factor_vec);
-          rescale_factor = rescale_factor_vec.get_last_elem();
-#else
          rescale_factor = std::exp(rescale_factor);
          vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
-#endif

          local_sum[head_idx] = final_max > curr_max
                                    ? final_sum + rescale_factor * curr_sum
--- a/csrc/cpu/cpu_attn_neon.hpp
+++ b/csrc/cpu/cpu_attn_neon.hpp
@ -0,0 +1,386 @@
+#ifndef CPU_ATTN_NEON_HPP
+#define CPU_ATTN_NEON_HPP
+
+#include "cpu_attn_impl.hpp"
+#include <arm_neon.h>
+#include <type_traits>
+namespace cpu_attention {
+
+namespace {
+
+#define BLOCK_SIZE_ALIGNMENT 32
+#define HEAD_SIZE_ALIGNMENT 32
+#define MAX_Q_HEAD_NUM_PER_ITER 16
+
+// The loaders below use raw NEON intrinsics instead of the vec_op wrapper
+// classes: csrc/cpu/cpu_types_arm.hpp has no fallback for vec_op::BF16Vec*
+// on Arm HW that doesn't support BF16, and the fp32 / fp16 paths skip
+// vec_op::FP32Vec* / vec_op::FP16Vec* to stay consistent with the bf16 one.
+//
+// load_row8_B_as_f32 reads 8 consecutive kv_cache_t values starting at p and
+// widens them to fp32: elements 0..3 go to b0, elements 4..7 go to b1.
+template <typename kv_cache_t>
+FORCE_INLINE void load_row8_B_as_f32(const kv_cache_t* p, float32x4_t& b0,
+                                     float32x4_t& b1);
+
+// fp32 source: two plain 4-lane loads, no conversion
+template <>
+FORCE_INLINE void load_row8_B_as_f32<float>(const float* p, float32x4_t& b0,
+                                            float32x4_t& b1) {
+  b0 = vld1q_f32(p);
+  b1 = vld1q_f32(p + 4);
+}
+
+// fp16 source: one 8-lane load, then widen the low and high halves
+template <>
+FORCE_INLINE void load_row8_B_as_f32<c10::Half>(const c10::Half* p,
+                                                float32x4_t& b0,
+                                                float32x4_t& b1) {
+  const float16x8_t halves = vld1q_f16(reinterpret_cast<const float16_t*>(p));
+  b0 = vcvt_f32_f16(vget_low_f16(halves));
+  b1 = vcvt_f32_f16(vget_high_f16(halves));
+}
+
+// bf16 source: load the raw 16-bit patterns once, then convert either with
+// the BF16 intrinsics or by hand
+template <>
+FORCE_INLINE void load_row8_B_as_f32<c10::BFloat16>(const c10::BFloat16* p,
+                                                    float32x4_t& b0,
+                                                    float32x4_t& b1) {
+  const uint16x8_t raw = vld1q_u16(reinterpret_cast<const uint16_t*>(p));
+#ifdef ARM_BF16_SUPPORT
+  const bfloat16x8_t bf = vreinterpretq_bf16_u16(raw);
+  b0 = vcvtq_low_f32_bf16(bf);
+  b1 = vcvtq_high_f32_bf16(bf);
+#else
+  // widen each 16-bit pattern to 32 bits and move it into the upper half
+  b0 = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(raw)), 16));
+  b1 = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(raw)), 16));
+#endif
+}
+
+// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with NEON FMLAs
+// #Loads = (K // 4) * (M + 4 * sizeof(kv_cache_t) / 2)
+// #FMLAs = (K // 4) * (4 * 2 * M)
+// We have (4 * 2 * M) FMLAs for (M + 4 * sizeof(kv_cache_t) / 2) loads
+//
+// Computes the M x 8 tile C = A * B, or C += A * B when `accumulate` is true.
+//   A   - fp32, row i starts at A + i * lda, K values per row
+//   B   - kv_cache_t, row k starts at B + k * ldb, 8 values per row, widened
+//         to fp32 by load_row8_B_as_f32
+//   C   - fp32, row i starts at C + i * ldc, 8 values per row
+// The body is generated per row with X-macros (ROWS_APPLY expands an
+// operation for rows 0..7); IF_M(i) wraps the per-row work in
+// `if constexpr (M > i)`, so rows at or beyond M are compiled out.
+template <int32_t M, typename kv_cache_t>
+FORCE_INLINE void gemm_micro_neon_fmla_Mx8_Ku4(
+    const float* __restrict A,       // [M x K],
+    const kv_cache_t* __restrict B,  // [K x 8],
+    float* __restrict C,             // [M x 8],
+    int64_t lda, int64_t ldb, int64_t ldc, int32_t K, bool accumulate) {
+  // kernel supports max M of 8, as it'd spill for larger M
+  static_assert(1 <= M && M <= 8, "M must be in [1,8]");
+
+// helpers for per-M codegen
+#define ROWS_APPLY(OP) OP(0) OP(1) OP(2) OP(3) OP(4) OP(5) OP(6) OP(7)
+#define IF_M(i) if constexpr (M > (i))
+
+  // A row base pointers
+  // (declared for all 8 rows regardless of M; only rows < M are dereferenced)
+#define DECL_A(i) const float* a##i = A + (i) * lda;
+  ROWS_APPLY(DECL_A)
+#undef DECL_A
+
+  // declare 2 accumulators per row of M
+  // (acc<i>_0 holds columns 0..3 of row i, acc<i>_1 holds columns 4..7)
+#define DECL_ACC(i) float32x4_t acc##i##_0, acc##i##_1;
+  ROWS_APPLY(DECL_ACC)
+#undef DECL_ACC
+
+  // initialize accumulators
+  // (from the current contents of C when accumulating, otherwise zero)
+#define INIT_ACC(i)                              \
+  IF_M(i) {                                      \
+    if (accumulate) {                            \
+      acc##i##_0 = vld1q_f32(C + (i) * ldc + 0); \
+      acc##i##_1 = vld1q_f32(C + (i) * ldc + 4); \
+    } else {                                     \
+      acc##i##_0 = vdupq_n_f32(0.f);             \
+      acc##i##_1 = vdupq_n_f32(0.f);             \
+    }                                            \
+  }
+  ROWS_APPLY(INIT_ACC)
+#undef INIT_ACC
+
+  int32_t k = 0;
+
+  // K unrolled by 4
+  for (; k + 3 < K; k += 4) {
+    // load A[k..k+3] for each active row (M)
+#define LOAD_A4(i)     \
+  float32x4_t a##i##v; \
+  IF_M(i) a##i##v = vld1q_f32(a##i + k);
+    ROWS_APPLY(LOAD_A4)
+#undef LOAD_A4
+
+    // helper: FMA lane L from aiv
+    // (multiplies the current B row, b0/b1, by lane L of the row's A vector)
+#define FMAS_LANE(i, aiv, L)                              \
+  IF_M(i) {                                               \
+    acc##i##_0 = vfmaq_laneq_f32(acc##i##_0, b0, aiv, L); \
+    acc##i##_1 = vfmaq_laneq_f32(acc##i##_1, b1, aiv, L); \
+  }
+
+    // k + 0
+    {
+      float32x4_t b0, b1;
+      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 0) * ldb, b0, b1);
+#define STEP_K0(i) FMAS_LANE(i, a##i##v, 0)
+      ROWS_APPLY(STEP_K0)
+#undef STEP_K0
+    }
+    // k + 1
+    {
+      float32x4_t b0, b1;
+      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 1) * ldb, b0, b1);
+#define STEP_K1(i) FMAS_LANE(i, a##i##v, 1)
+      ROWS_APPLY(STEP_K1)
+#undef STEP_K1
+    }
+    // k + 2
+    {
+      float32x4_t b0, b1;
+      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 2) * ldb, b0, b1);
+#define STEP_K2(i) FMAS_LANE(i, a##i##v, 2)
+      ROWS_APPLY(STEP_K2)
+#undef STEP_K2
+    }
+    // k + 3
+    {
+      float32x4_t b0, b1;
+      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 3) * ldb, b0, b1);
+#define STEP_K3(i) FMAS_LANE(i, a##i##v, 3)
+      ROWS_APPLY(STEP_K3)
+#undef STEP_K3
+    }
+#undef FMAS_LANE
+  }
+
+  // K tail
+  // (remaining K % 4 steps, one B row at a time with a broadcast A scalar)
+  for (; k < K; ++k) {
+    float32x4_t b0, b1;
+    load_row8_B_as_f32<kv_cache_t>(B + (int64_t)k * ldb, b0, b1);
+#define TAIL_ROW(i)                             \
+  IF_M(i) {                                     \
+    float32x4_t ai = vdupq_n_f32(*(a##i + k));  \
+    acc##i##_0 = vfmaq_f32(acc##i##_0, b0, ai); \
+    acc##i##_1 = vfmaq_f32(acc##i##_1, b1, ai); \
+  }
+    ROWS_APPLY(TAIL_ROW)
+#undef TAIL_ROW
+  }
+
+  // store accumulators to C
+#define STORE_ROW(i)                          \
+  IF_M(i) {                                   \
+    vst1q_f32(C + (i) * ldc + 0, acc##i##_0); \
+    vst1q_f32(C + (i) * ldc + 4, acc##i##_1); \
+  }
+  ROWS_APPLY(STORE_ROW)
+#undef STORE_ROW
+
+#undef ROWS_APPLY
+#undef IF_M
+}
+
+// Macro kernel: computes the M x N product C = A * B (or C += A * B when
+// `accumulate` is set) by tiling it into micro-kernel calls. Rows are
+// consumed in the largest chunk of 8, 4, 2 or 1 that still fits; columns
+// advance in steps of 8, the width of the micro kernel.
+template <int32_t N, typename kv_cache_t>
+FORCE_INLINE void gemm_macro_neon_fmla_Mx8_Ku4(const float* __restrict A,
+                                               const kv_cache_t* __restrict B,
+                                               float* __restrict C, int32_t M,
+                                               int32_t K, int64_t lda,
+                                               int64_t ldb, int64_t ldc,
+                                               bool accumulate) {
+  // the micro kernel always produces 8 columns, so N needs no tail handling
+  static_assert(N % 8 == 0, "N must be a multiple of 8");
+
+  int32_t row = 0;
+  while (row < M) {
+    // pick the row-chunk height for this pass
+    const int32_t rows_left = M - row;
+    int32_t chunk = 1;
+    if (rows_left >= 8) {
+      chunk = 8;
+    } else if (rows_left >= 4) {
+      chunk = 4;
+    } else if (rows_left >= 2) {
+      chunk = 2;
+    }
+
+    const float* a_rows = A + row * lda;
+    float* c_rows = C + row * ldc;
+
+    for (int32_t col = 0; col < N; col += 8) {
+      const kv_cache_t* b_cols = B + col;
+      float* c_tile = c_rows + col;
+      if (chunk == 8) {
+        gemm_micro_neon_fmla_Mx8_Ku4<8, kv_cache_t>(a_rows, b_cols, c_tile, lda,
+                                                    ldb, ldc, K, accumulate);
+      } else if (chunk == 4) {
+        gemm_micro_neon_fmla_Mx8_Ku4<4, kv_cache_t>(a_rows, b_cols, c_tile, lda,
+                                                    ldb, ldc, K, accumulate);
+      } else if (chunk == 2) {
+        gemm_micro_neon_fmla_Mx8_Ku4<2, kv_cache_t>(a_rows, b_cols, c_tile, lda,
+                                                    ldb, ldc, K, accumulate);
+      } else {
+        gemm_micro_neon_fmla_Mx8_Ku4<1, kv_cache_t>(a_rows, b_cols, c_tile, lda,
+                                                    ldb, ldc, K, accumulate);
+      }
+    }
+
+    row += chunk;
+  }
+}
+
+// Tile GEMM adapter that routes the QK and non-QK GEMM calls to the NEON
+// FMLA macro kernel.
+template <typename kv_cache_t>
+class TileGemmNeonFMLA {
+ public:
+  // QK phase: output width is BLOCK_SIZE_ALIGNMENT and the reduction length
+  // is the compile-time k_size.
+  // Any other phase: output width is HEAD_SIZE_ALIGNMENT and the reduction
+  // length is the runtime dynamic_k_size.
+  // block_size is accepted for interface compatibility and not used here.
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size,
+                                float* __restrict__ a_tile,
+                                kv_cache_t* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    constexpr bool is_qk_phase = (phase == AttentionGemmPhase::QK);
+    if constexpr (!is_qk_phase) {
+      gemm_macro_neon_fmla_Mx8_Ku4<HEAD_SIZE_ALIGNMENT, kv_cache_t>(
+          a_tile, b_tile, c_tile, m_size, dynamic_k_size, lda, ldb, ldc,
+          accum_c);
+    } else {
+      gemm_macro_neon_fmla_Mx8_Ku4<BLOCK_SIZE_ALIGNMENT, kv_cache_t>(
+          a_tile, b_tile, c_tile, m_size, k_size, lda, ldb, ldc, accum_c);
+    }
+  }
+};
+
+}  // namespace
+
+// this is similar to "ISA::VEC" at the moment
+//
+// AttentionImpl specialization for ISA::NEON. It fixes the buffer element
+// types (fp32 for every intermediate buffer, scalar_t for query and KV
+// cache), the tiling constants, the KV cache layout strides, and supplies
+// the query-copy and cache-write helpers. GEMMs are delegated to
+// TileGemmNeonFMLA.
+template <typename scalar_t, int64_t head_dim>
+class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
+ public:
+  using query_t = scalar_t;
+  using q_buffer_t = float;
+  using kv_cache_t = scalar_t;
+  using logits_buffer_t = float;
+  using partial_output_buffer_t = float;
+  using prob_buffer_t = float;
+
+  constexpr static int64_t BlockSizeAlignment =
+      BLOCK_SIZE_ALIGNMENT;  // KV token num unit of QK and PV phases
+  constexpr static int64_t HeadDimAlignment =
+      HEAD_SIZE_ALIGNMENT;  // headdim num unit of PV phase
+  constexpr static int64_t MaxQHeadNumPerIteration = MAX_Q_HEAD_NUM_PER_ITER;
+  constexpr static int64_t HeadDim = head_dim;
+  constexpr static ISA ISAType = ISA::NEON;
+  constexpr static bool scale_on_logits = false;  // apply scale on q_buffer
+
+  static_assert(HeadDim % HeadDimAlignment == 0);
+  // the gemm micro kernel is Mx8
+  static_assert(HeadDimAlignment % 8 == 0);
+  static_assert(BlockSizeAlignment % 8 == 0);
+
+ public:
+  // Instantiates the `attention` template with the NEON tile GEMM and runs
+  // it with the parameters named by CPU_ATTENTION_PARAMS (macro defined
+  // outside this file).
+  template <template <typename tile_gemm_t> typename attention>
+  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+    attention<TileGemmNeonFMLA<kv_cache_t>> attention_iteration;
+    attention_iteration(CPU_ATTENTION_PARAMS);
+  }
+
+  // k_cache_token_group_stride: stride of K cache when move to next
+  // BlockSizeAlignment tokens in a block
+  constexpr static int64_t k_cache_token_group_stride(
+      const int32_t block_size) {
+    return BlockSizeAlignment;  // layout of k_cache block is [head_dim,
+                                // block_size], row-major
+  }
+
+  // v_cache_token_group_stride: stride of V cache when move to next
+  // BlockSizeAlignment tokens in a block
+  constexpr static int64_t v_cache_token_group_stride(
+      const int32_t block_size) {
+    return head_dim * BlockSizeAlignment;  // layout of v_cache is [block_size,
+                                           // head_dim], row-major
+  }
+
+  // v_cache_head_group_stride: stride of V cache when move to next
+  // HeadDimAlignment head dims in a block
+  constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
+    return HeadDimAlignment;  // layout of v_cache is [block_size, head_dim],
+                              // row-major
+  }
+
+  // Copy q to q_buffer and cast it to fp32
+  // Each head is read from src + q_num_idx * q_num_stride +
+  // q_head_idx * q_head_stride, multiplied by `scale`, and written densely
+  // to q_buffer as [q_num, q_heads_per_kv, head_dim]. Elements are handled
+  // 16 at a time, hence the head_dim % 16 requirement.
+  static void copy_q_heads_tile(
+      scalar_t* __restrict__ src,  // [q_num, q_heads_per_kv, head_size]
+      float* __restrict__ q_buffer, const int32_t q_num,
+      const int32_t q_heads_per_kv, const int64_t q_num_stride,
+      const int64_t q_head_stride, float scale) {
+    static_assert(head_dim % 16 == 0);
+    constexpr int32_t unroll_size = head_dim / 16;
+    using load_vec_t = typename VecTypeTrait<scalar_t>::vec_t;
+
+    vec_op::FP32Vec16 scale_vec(scale);
+    for (int32_t q_num_idx = 0; q_num_idx < q_num; ++q_num_idx) {
+      for (int32_t q_head_idx = 0; q_head_idx < q_heads_per_kv; ++q_head_idx) {
+        scalar_t* __restrict__ curr_q =
+            src + q_num_idx * q_num_stride + q_head_idx * q_head_stride;
+        float* __restrict__ curr_q_buffer =
+            q_buffer + q_num_idx * q_heads_per_kv * head_dim +
+            q_head_idx * head_dim;
+
+        // presumably invokes the lambda unroll_size times in order; the
+        // lambda ignores `i` and advances both pointers itself -- confirm
+        // against vec_op::unroll_loop
+        vec_op::unroll_loop<int32_t, unroll_size>([&](int32_t i) {
+          load_vec_t vec(curr_q);
+          vec_op::FP32Vec16 fp32_vec(vec);
+          fp32_vec = fp32_vec * scale_vec;
+          fp32_vec.save(curr_q_buffer);
+
+          curr_q += 16;
+          curr_q_buffer += 16;
+        });
+      }
+    }
+  }
+
+  // reshape K as column-major and V as row-major
+  // For every (token, head) pair the slot index from slot_mapping selects a
+  // cache block (slot / block_size) and an offset inside it
+  // (slot % block_size). Negative slots are skipped. K elements are
+  // scattered block_size apart starting at block_offset; V is copied as one
+  // contiguous run of head_dim elements at block_offset * head_dim.
+  // num_blocks and block_size_stride are not read by this implementation.
+  static void reshape_and_cache(
+      const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
+      scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
+      const int64_t* __restrict__ slot_mapping, const int64_t token_num,
+      const int64_t key_token_num_stride, const int64_t value_token_num_stride,
+      const int64_t head_num, const int64_t key_head_num_stride,
+      const int64_t value_head_num_stride, const int64_t num_blocks,
+      const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
+      const int64_t block_size, const int64_t block_size_stride) {
+#pragma omp parallel for collapse(2)
+    for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
+      for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
+        const int64_t pos = slot_mapping[token_idx];
+        if (pos < 0) {
+          // skip
+          continue;
+        }
+
+        const int64_t block_idx = pos / block_size;
+        const int64_t block_offset = pos % block_size;
+        {
+          // Write Key
+          const scalar_t* key_start_ptr = key +
+                                          token_idx * key_token_num_stride +
+                                          head_idx * key_head_num_stride;
+          scalar_t* key_cache_start_ptr =
+              key_cache + block_idx * num_blocks_stride +
+              head_idx * cache_head_num_stride + block_offset;
+
+          // element i of the head goes to row i of the [head_dim, block_size]
+          // block, i.e. i * block_size elements further on
+#pragma GCC unroll 8
+          for (int64_t i = 0, j = 0; i < head_dim; ++i, j += block_size) {
+            key_cache_start_ptr[j] = key_start_ptr[i];
+          }
+        }
+        {
+          // Write Value
+          const scalar_t* value_start_ptr = value +
+                                            token_idx * value_token_num_stride +
+                                            head_idx * value_head_num_stride;
+          scalar_t* value_cache_start_ptr =
+              value_cache + block_idx * num_blocks_stride +
+              head_idx * cache_head_num_stride + block_offset * head_dim;
+          std::memcpy(value_cache_start_ptr, value_start_ptr,
+                      sizeof(scalar_t) * head_dim);
+        }
+      }
+    }
+  }
+};
+}  // namespace cpu_attention
+
+#endif  // #ifndef CPU_ATTN_NEON_HPP
--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				`Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml`