Merge remote-tracking branch 'origin/main' into conftest/generate_beam_search/simplify-return-value

2026-04-09 09:07:03 +08:00 · 2025-12-11 19:39:04 +00:00 · 2025-12-11 19:39:04 +00:00 · b13842b68f
commit b13842b68f
parent 95af481818 72aaac5b66
347 changed files with 12123 additions and 3325 deletions
--- a/.buildkite/ci_config.yaml
+++ b/.buildkite/ci_config.yaml
@ -0,0 +1,24 @@
+# CI configuration for the "vllm_ci" pipeline.
+name: vllm_ci
+# Directories that hold job definitions (image_build.yaml lives in the second one).
+# NOTE(review): how these are scanned depends on the pipeline generator — confirm there.
+job_dirs:
+  - ".buildkite/test_areas"
+  - ".buildkite/image_build"
+# Path patterns; judging by the key name, a change matching one of them runs
+# every job — TODO confirm against the pipeline generator.
+run_all_patterns:
+  - "docker/Dockerfile"
+  - "CMakeLists.txt"
+  - "requirements/common.txt"
+  - "requirements/cuda.txt"
+  - "requirements/build.txt"
+  - "requirements/test.txt"
+  - "setup.py"
+  - "csrc/"
+  - "cmake/"
+# Exceptions to the patterns above. Presumably prefix-matched, so
+# "docker/Dockerfile." would exclude suffixed files such as docker/Dockerfile.cpu
+# — TODO confirm the matching rule.
+run_all_exclude_patterns:
+  - "docker/Dockerfile."
+  - "csrc/cpu/"
+  - "csrc/rocm/"
+  - "cmake/hipify.py"
+  - "cmake/cpu_extension.cmake"
+# Image registry and per-branch repository names; presumably the source of the
+# $REGISTRY / $REPO values handed to the image build scripts — verify.
+registries: public.ecr.aws/q9t5s3a7
+repositories:
+  main: "vllm-ci-postmerge-repo"
+  premerge: "vllm-ci-test-repo"
--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
@ -0,0 +1,56 @@
+#!/bin/bash
+# Build the CUDA CI image (docker/Dockerfile, target "test") with docker buildx
+# and push it as <registry>/<repo>:<commit>. Builds of the "main" branch are
+# additionally tagged :latest. Exits 0 without building when the commit tag is
+# already present in the registry.
+#
+# Positional arguments (all eight are required):
+#   <registry> <repo> <commit> <branch> <vllm_use_precompiled>
+#   <vllm_merge_base_commit> <cache_from> <cache_to>
+set -e
+
+if [[ $# -lt 8 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+BRANCH=$4
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+CACHE_FROM=$7
+CACHE_TO=$8
+
+IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}"
+BUILDER_NAME="vllm-builder"
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+
+# docker buildx
+# Reuse the builder if it already exists: `buildx create --name` fails on a
+# duplicate name, which under `set -e` would abort a retried job or a job on a
+# reused agent.
+if docker buildx inspect "$BUILDER_NAME" > /dev/null 2>&1; then
+  docker buildx use "$BUILDER_NAME"
+else
+  docker buildx create --name "$BUILDER_NAME" --driver docker-container --use
+fi
+docker buildx inspect --bootstrap
+docker buildx ls
+
+# skip build if image already exists
+# (a failing command substitution inside [[ ]] does not trigger `set -e`;
+# a missing manifest simply yields empty output)
+if [[ -z "$(docker manifest inspect "$IMAGE")" ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
+fi
+
+# Optional arguments are collected in an array so that each value stays a
+# single word regardless of its content.
+extra_build_args=()
+if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
+  extra_build_args+=(--build-arg "VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}")
+fi
+if [[ "${BRANCH}" == "main" ]]; then
+  extra_build_args+=(--tag "${REGISTRY}/${REPO}:latest")
+fi
+
+# build
+docker buildx build --file docker/Dockerfile \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit="${BUILDKITE_COMMIT}" \
+  --build-arg USE_SCCACHE=1 \
+  --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
+  --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
+  --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
+  --cache-from "type=registry,ref=${CACHE_FROM},mode=max" \
+  --cache-to "type=registry,ref=${CACHE_TO},mode=max" \
+  --tag "${IMAGE}" \
+  "${extra_build_args[@]}" \
+  --push \
+  --target test \
+  --progress plain .
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@ -0,0 +1,57 @@
+# Image build steps. Each step hands registry, repo and commit (plus build
+# options for the CUDA image) to the matching script in .buildkite/image_build/.
+# Arguments are double-quoted so that an empty variable still reaches the
+# script as an empty positional argument instead of shifting the ones after it
+# (image_build.sh rejects anything short of eight arguments).
+group: Abuild
+steps:
+  - label: ":docker: Build image"
+    key: image-build
+    depends_on: []
+    commands:
+    - .buildkite/image_build/image_build.sh "$REGISTRY" "$REPO" "$BUILDKITE_COMMIT" "$BRANCH" "$VLLM_USE_PRECOMPILED" "$VLLM_MERGE_BASE_COMMIT" "$CACHE_FROM" "$CACHE_TO"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build CPU image"
+    key: image-build-cpu
+    depends_on: []
+    commands:
+    - .buildkite/image_build/image_build_cpu.sh "$REGISTRY" "$REPO" "$BUILDKITE_COMMIT"
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
+
+  # soft_fail: a failing HPU image build does not fail the whole build
+  - label: ":docker: Build HPU image"
+    soft_fail: true
+    depends_on: []
+    key: image-build-hpu
+    commands:
+    - .buildkite/image_build/image_build_hpu.sh "$REGISTRY" "$REPO" "$BUILDKITE_COMMIT"
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build CPU arm64 image"
+    key: cpu-arm64-image-build
+    depends_on: []
+    optional: true
+    commands:
+    - .buildkite/image_build/image_build_cpu_arm64.sh "$REGISTRY" "$REPO" "$BUILDKITE_COMMIT"
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
--- a/.buildkite/image_build/image_build_cpu.sh
+++ b/.buildkite/image_build/image_build_cpu.sh
@ -0,0 +1,36 @@
+#!/bin/bash
+# Build the CPU CI image (docker/Dockerfile.cpu, target "vllm-test") with the
+# AVX512-BF16, AVX512-VNNI and AMX-BF16 build args enabled, then push it as
+# <registry>/<repo>:<commit>-cpu. Exits 0 without building when that tag is
+# already present in the registry.
+#
+# Usage: image_build_cpu.sh <registry> <repo> <commit>
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# single definition of the image reference, used for check, build and push
+IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-cpu"
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+
+# skip build if image already exists
+# (a missing manifest yields empty output; the failure does not trip `set -e`
+# because it happens inside a [[ ]] test)
+if [[ -z "$(docker manifest inspect "$IMAGE")" ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit="${BUILDKITE_COMMIT}" \
+  --build-arg VLLM_CPU_AVX512BF16=true \
+  --build-arg VLLM_CPU_AVX512VNNI=true \
+  --build-arg VLLM_CPU_AMXBF16=true \
+  --tag "$IMAGE" \
+  --target vllm-test \
+  --progress plain .
+
+# push
+docker push "$IMAGE"
--- a/.buildkite/image_build/image_build_cpu_arm64.sh
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@ -0,0 +1,33 @@
+#!/bin/bash
+# Build the arm64 CPU CI image (docker/Dockerfile.cpu, target "vllm-test") and
+# push it as <registry>/<repo>:<commit>-cpu. Exits 0 without building when that
+# tag is already present in the registry.
+#
+# NOTE(review): image_build_cpu.sh checks for, builds and pushes the very same
+# "<commit>-cpu" tag. If both steps run for one commit, the script that checks
+# after the other has pushed reports "Image found" and skips, so only one
+# architecture ends up under the shared tag. Confirm whether the arm64 image
+# should get its own tag (and update whatever pulls it accordingly).
+#
+# Usage: image_build_cpu_arm64.sh <registry> <repo> <commit>
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# single definition of the image reference, used for check, build and push
+IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-cpu"
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+
+# skip build if image already exists
+# (a missing manifest yields empty output; the failure does not trip `set -e`
+# because it happens inside a [[ ]] test)
+if [[ -z "$(docker manifest inspect "$IMAGE")" ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit="${BUILDKITE_COMMIT}" \
+  --tag "$IMAGE" \
+  --target vllm-test \
+  --progress plain .
+
+# push
+docker push "$IMAGE"
--- a/.buildkite/image_build/image_build_hpu.sh
+++ b/.buildkite/image_build/image_build_hpu.sh
@ -0,0 +1,34 @@
+#!/bin/bash
+# Build the HPU CI image and push it as <registry>/<repo>:<commit>-hpu.
+# The build context is the remote vllm-gaudi git repository, using
+# tests/pytorch_ci_hud_benchmark/Dockerfile.hpu from that repository.
+# Exits 0 without building when the tag is already present in the registry.
+#
+# Usage: image_build_hpu.sh <registry> <repo> <commit>
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# single definition of the image reference, used for check, build and push
+IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-hpu"
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+
+# skip build if image already exists
+# (a missing manifest yields empty output; the failure does not trip `set -e`
+# because it happens inside a [[ ]] test)
+if [[ -z "$(docker manifest inspect "$IMAGE")" ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
+fi
+
+# build
+docker build \
+  --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit="${BUILDKITE_COMMIT}" \
+  --tag "$IMAGE" \
+  --progress plain \
+  https://github.com/vllm-project/vllm-gaudi.git
+
+# push
+docker push "$IMAGE"
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -15,6 +15,21 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"

+  - label: "Build arm64 wheel - CUDA 13.0"
+    depends_on: ~
+    id: build-wheel-arm64-cuda-13-0
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      # NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
+      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04  --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+    env:
+      DOCKER_BUILDKIT: "1"
+
  # aarch64 build
  - label: "Build arm64 CPU wheel"
    depends_on: ~
@ -25,7 +40,7 @@ steps:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
    env:
      DOCKER_BUILDKIT: "1"

@ -39,7 +54,7 @@ steps:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
    env:
      DOCKER_BUILDKIT: "1"

@ -52,7 +67,7 @@ steps:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
    env:
      DOCKER_BUILDKIT: "1"

--- a/.buildkite/scripts/generate-nightly-index.py
+++ b/.buildkite/scripts/generate-nightly-index.py
@ -372,6 +372,17 @@ if __name__ == "__main__":

    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")

+    # keep only "official" files for a non-nightly version (specified by cli args)
+    PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
+    if PY_VERSION_RE.match(version):
+        # upload-wheels.sh ensures no "dev" is in args.version
+        wheel_files = list(
+            filter(lambda x: version in x and "dev" not in x, wheel_files)
+        )
+        print(f"Non-nightly version detected, wheel files used: {wheel_files}")
+    else:
+        print("Nightly version detected, keeping all wheel files.")
+
    # Generate index and metadata, assuming wheels and indices are stored as:
    # s3://vllm-wheels/{version}/<wheel files>
    # s3://vllm-wheels/<anything>/<index files>
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@ -36,11 +36,17 @@ function cpu_tests() {
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

+  # Run model tests
+  docker exec cpu-test bash -c "
+    set -e
+    pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
+
  # Run kernel tests
  docker exec cpu-test bash -c "
    set -e
    pytest -x -v -s tests/kernels/test_onednn.py
-    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
+    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+    pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"

  # basic online serving
  docker exec cpu-test bash -c '
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -47,6 +47,6 @@ docker run \
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
-    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
+    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
    pytest -v -s v1/test_serial_utils.py
 '
--- a/.buildkite/scripts/run-prime-rl-test.sh
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
 PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
 PRIME_RL_DIR="${REPO_ROOT}/prime-rl"

+# Prime-RL is NVIDIA-only: when either ROCm tool is on PATH, report the skip
+# and exit successfully before any setup work starts.
+for rocm_tool in rocm-smi rocminfo; do
+    if command -v "$rocm_tool" > /dev/null 2>&1; then
+        echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
+        exit 0
+    fi
+done
+
 echo "Setting up Prime-RL integration test environment..."

 # Clean up any existing Prime-RL directory
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
@ -1,73 +0,0 @@
-#!/usr/bin/env bash
-set -euxo pipefail
-
-# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
-THRESHOLD=${1:-0.25}
-NUM_Q=${2:-1319}
-PORT=${3:-8030}
-OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
-mkdir -p "${OUT_DIR}"
-
-wait_for_server() {
-  local port=$1
-  timeout 600 bash -c '
-    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
-      sleep 1
-    done'
-}
-
-MODEL="deepseek-ai/DeepSeek-V2-lite"
-
-# Set BACKENDS based on platform
-if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
-  # ROCm platform
-  BACKENDS=("allgather_reducescatter")
-  # Disable MOE padding for ROCm since it is causing eplb to fail
-  export VLLM_ROCM_MOE_PADDING=0
-else
-  # Non-ROCm platform (CUDA/other)
-  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
-fi
-
-cleanup() {
-  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
-    kill "${SERVER_PID}" 2>/dev/null || true
-    for _ in {1..20}; do
-      kill -0 "${SERVER_PID}" 2>/dev/null || break
-      sleep 0.5
-    done
-    kill -9 "${SERVER_PID}" 2>/dev/null || true
-  fi
-}
-trap cleanup EXIT
-
-for BACK in "${BACKENDS[@]}"; do
-  VLLM_DEEP_GEMM_WARMUP=skip \
-  VLLM_ALL2ALL_BACKEND=$BACK \
-  vllm serve "$MODEL" \
-    --enforce-eager \
-    --tensor-parallel-size 2 \
-    --data-parallel-size 2 \
-    --enable-expert-parallel \
-    --enable-eplb \
-    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
-    --trust-remote-code \
-    --max-model-len 2048 \
-    --port $PORT &
-  SERVER_PID=$!
-  wait_for_server $PORT
-
-  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
-  OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
-  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
-  python3 - <<PY
-import json; acc=json.load(open('${OUT}'))['accuracy']
-print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
-assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
-PY
-
-  cleanup
-  SERVER_PID=
-  sleep 1
-  PORT=$((PORT+1))
-done
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@ -50,7 +50,6 @@ for BACK in "${BACKENDS[@]}"; do
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
-    --eplb-config '{"window_size":200,"step_interval":600}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
--- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
@ -1,74 +0,0 @@
-#!/usr/bin/env bash
-set -euxo pipefail
-
-# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
-THRESHOLD=${1:-0.25}
-NUM_Q=${2:-1319}
-PORT=${3:-8040}
-OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
-mkdir -p "${OUT_DIR}"
-
-wait_for_server() {
-  local port=$1
-  timeout 600 bash -c '
-    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
-      sleep 1
-    done'
-}
-
-MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
-
-# Set BACKENDS based on platform
-if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
-  # ROCm platform
-  BACKENDS=("allgather_reducescatter")
-  # Disable MOE padding for ROCm since it is causing eplb to fail
-  export VLLM_ROCM_MOE_PADDING=0
-else
-  # Non-ROCm platform (CUDA/other)
-  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
-fi
-
-cleanup() {
-  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
-    kill "${SERVER_PID}" 2>/dev/null || true
-    for _ in {1..20}; do
-      kill -0 "${SERVER_PID}" 2>/dev/null || break
-      sleep 0.5
-    done
-    kill -9 "${SERVER_PID}" 2>/dev/null || true
-  fi
-}
-trap cleanup EXIT
-
-for BACK in "${BACKENDS[@]}"; do
-  VLLM_DEEP_GEMM_WARMUP=skip \
-  VLLM_ALL2ALL_BACKEND=$BACK \
-  vllm serve "$MODEL" \
-    --enforce-eager \
-    --tensor-parallel-size 4 \
-    --enable-expert-parallel \
-    --enable-eplb \
-    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
-    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
-    --trust-remote-code \
-    --max-model-len 2048 \
-    --gpu-memory-utilization 0.9 \
-    --port $PORT &
-  SERVER_PID=$!
-  wait_for_server $PORT
-
-  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
-  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
-  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
-  python3 - <<PY
-import json; acc=json.load(open('${OUT}'))['accuracy']
-print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
-assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
-PY
-
-  cleanup
-  SERVER_PID=
-  sleep 1
-  PORT=$((PORT+1))
-done
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@ -34,9 +34,10 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
 fi
 wheel="${wheel_files[0]}"

-# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
+# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
+# we also accept params as manylinux tag
 # refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
-manylinux_version="manylinux_2_31"
+manylinux_version="${1:-manylinux_2_31}"

 # Rename 'linux' to the appropriate manylinux version in the wheel filename
 if [[ "$wheel" != *"linux"* ]]; then
@ -96,8 +97,11 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]];
    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
 fi

-# copy to /<pure_version>/ only if it does not have "dev" in the version
+# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
 if [[ "$version" != *"dev"* ]]; then
-    echo "Uploading indices to overwrite /$pure_version/"
+    echo "Re-generating indices for /$pure_version/"
+    # Clear the indices generated earlier in this script before re-generating.
+    # The glob has to sit outside the quotes to expand (quoted, it only matches
+    # a file literally named "*"); ${VAR:?} aborts instead of expanding to
+    # `rm -rf /*` should the variable ever be empty or unset.
+    rm -rf "${INDICES_OUTPUT_DIR:?}"/*
+    mkdir -p "$INDICES_OUTPUT_DIR"
+    # $alias_arg is left unquoted so that it expands to its separate words
+    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
     aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
 fi
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@ -398,7 +398,8 @@ steps:
  timeout_in_minutes: 25
  gpu: h100
  source_file_dependencies:
-    - vllm/
+    - vllm/v1/attention
+    - vllm/model_executor/layers
    - tests/v1/determinism/
  commands:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
@ -440,23 +441,29 @@ steps:
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
  - vllm/entrypoints
+  - vllm/multimodal
  - examples/
  commands:
    - pip install tensorizer # for tensorizer test
+    # for basic
+    - python3 offline_inference/basic/chat.py
    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-    - python3 offline_inference/basic/chat.py
-    - python3 offline_inference/prefix_caching.py
-    - python3 offline_inference/llm_engine_example.py
+    - python3 offline_inference/basic/classify.py
+    - python3 offline_inference/basic/embed.py
+    - python3 offline_inference/basic/score.py
+    # for multi-modal models
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
    - python3 offline_inference/vision_language_pooling.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
-    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-    - python3 offline_inference/basic/classify.py
-    - python3 offline_inference/basic/embed.py
-    - python3 offline_inference/basic/score.py
+    # for pooling models
+    - python3 pooling/pooling/vision_language_pooling.py --seed 0
+    # for features demo
+    - python3 offline_inference/prefix_caching.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@ -718,6 +725,18 @@ steps:
  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

+# GSM8K correctness run over configs/models-small.txt with tensor-parallel size 1.
+# NOTE(review): an unchanged step further down this file already runs the
+# identical pytest command — confirm this is not a duplicate left by the merge.
+- label: LM Eval Small Models # 53min
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  autorun_on_main: true
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
 - label: OpenAI API correctness # 10min
  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental, amdproduction]
@ -727,7 +746,7 @@ steps:
  - csrc/
  - vllm/entrypoints/openai/
  - vllm/model_executor/models/whisper.py
-  commands: # LMEval
+  commands: # LMEval+Transcription WER check
  # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
  - pytest -s entrypoints/openai/correctness/

@ -963,6 +982,19 @@ steps:
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

+# lm-eval correctness run over configs/models-mm-small.txt with tensor-parallel size 1.
+# NOTE(review): the runtime estimate in the label comment tops out at 180min,
+# which equals timeout_in_minutes — there is no headroom for a slow run.
+- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
 - label: Multi-Modal Models Test (Extended) 1 # 60min
  timeout_in_minutes: 120
  mirror_hardwares: [amdexperimental]
@ -1098,7 +1130,6 @@ steps:
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - vllm/model_executor/layers/fused_moe/layer.py
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
@ -1132,12 +1163,25 @@ steps:
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
-  - tests/compile/fullgraph/test_full_graph.py
  commands:
    - nvidia-smi
    # Run all e2e fusion tests
    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

+# GPQA correctness check for openai/gpt-oss-20b (pass threshold --metric 0.58).
+# NOTE(review): this step requests `gpu: b200` and sets neither
+# mirror_hardwares nor agent_pool, unlike the AMD-mirrored steps in this file —
+# confirm it is meant to live in test-amd.yaml.
+- label: Blackwell GPT-OSS Eval
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - uv pip install --system 'gpt-oss[eval]==0.0.5'
+    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
 - label: Blackwell Quantized MoE Test
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
@ -1155,6 +1199,16 @@ steps:
  commands:
    - pytest -s -v tests/quantization/test_blackwell_moe.py

+# GSM8K correctness run over configs/models-blackwell.txt with tensor-parallel size 1.
+# NOTE(review): requests `gpu: b200` with no mirror_hardwares / agent_pool —
+# confirm it is meant to live in test-amd.yaml.
+- label: Blackwell LM Eval Small Models
+  timeout_in_minutes: 120
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+
 #####  1 GPU test  #####
 #####  multi gpus test  #####

@ -1397,6 +1451,39 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

+
+# lm-eval correctness run over configs/models-large.txt with tensor-parallel size 4.
+# NOTE(review): the unchanged "LM Eval Large Models (4 Card)" step later in
+# this file runs the same two commands — confirm both steps are wanted.
+- label: LM Eval Large Models # optional
+  gpu: a100
+  optional: true
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+##### H100 test #####
+# lm-eval correctness run over configs/models-large-hopper.txt with
+# tensor-parallel size 4 and DeepGEMM disabled.
+# NOTE(review): the copy of this step removed further down mirrored to
+# [amdexperimental, amdproduction]; this one mirrors to amdexperimental only —
+# confirm the narrower mirroring is intended.
+- label: LM Eval Large Models (H100) # optional
+  gpu: h100
+  optional: true
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
+    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+
 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
  mirror_hardwares: [amdexperimental]
@ -1440,29 +1527,6 @@ steps:
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

- label: Blackwell LM Eval Small Models
-  timeout_in_minutes: 120
-  gpu: b200
-  optional: true # run on nightlies
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
-
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
-  timeout_in_minutes: 70
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
-  # grade: Blocking
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - vllm/multimodal/
-  - vllm/inputs/
-  - vllm/v1/core/
-  commands:
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-
 - label: LM Eval Large Models (4 Card)
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
@ -1478,21 +1542,6 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

- label: LM Eval Large Models (H100) # optional
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_4
-  # grade: Blocking
-  gpu: h100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
-    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
-
 - label: ROCm LM Eval Large Models (8 Card)
  mirror_hardwares: [amdproduction]
  agent_pool: mi325_8
@ -1517,6 +1566,20 @@ steps:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
    - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

+##### RL Integration Tests #####
+# NOTE(review): .buildkite/scripts/run-prime-rl-test.sh exits 0 immediately
+# when rocm-smi or rocminfo is on PATH, so on the AMD agents this step is
+# mirrored to it reports success without running anything.
+- label: Prime-RL Integration Test # 15min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  timeout_in_minutes: 30
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  source_file_dependencies:
+  - vllm/
+  - .buildkite/scripts/run-prime-rl-test.sh
+  commands:
+    - bash .buildkite/scripts/run-prime-rl-test.sh
+
 - label: DeepSeek V2-Lite Accuracy
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
@ -1550,17 +1613,26 @@ steps:
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

-##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+  # NOTE(review): the script invoked below,
+  # .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh,
+  # is deleted by this same change, so this step cannot run as written.
+  # Either restore the script or drop the step.
+  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_2
+  agent_pool: mi325_4
  # grade: Blocking
-  timeout_in_minutes: 30
+  gpu: h100
  optional: true
-  num_gpus: 2
+  num_gpus: 4
  working_dir: "/vllm-workspace"
-  source_file_dependencies:
-  - vllm/
-  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
-    - bash .buildkite/scripts/run-prime-rl-test.sh
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+  # NOTE(review): the script invoked below,
+  # .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh,
+  # is deleted by this same change, so this step cannot run as written.
+  # Either restore the script or drop the step.
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -468,7 +468,9 @@ steps:
  # tests covered elsewhere.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
+  # However, find does not normally propagate error codes, so we combine it with xargs
+  # (-0 for proper path handling; -I{} already runs pytest once per file, so no -n1)
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -I{} pytest -s -v '{}'"

 - label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
@ -482,7 +484,9 @@ steps:
  # as it is a heavy test that is covered in other steps.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
+  # However, find does not normally propagate error codes, so we combine it with xargs
+  # (-0 for proper path handling; -I{} already runs pytest once per file, so no -n1)
+  - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -I{} pytest -s -v '{}'"

 - label: PyTorch Fullgraph Test # 27min
  timeout_in_minutes: 40
@ -1375,22 +1379,4 @@ steps:
  num_gpus: 2
  working_dir: "/vllm-workspace"
  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
-
- label: DeepSeek V2-Lite Async EPLB Accuracy
-  timeout_in_minutes: 60
-  gpu: h100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace"
-  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
-
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
-  timeout_in_minutes: 60
-  gpu: h100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace"
-  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
--- a/.buildkite/test_areas/attention.yaml
+++ b/.buildkite/test_areas/attention.yaml
@ -0,0 +1,21 @@
+group: Attention
+depends_on: 
+  - image-build
+steps:
+- label: V1 attention (H100)
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - pytest -v -s v1/attention
+
+- label: V1 attention (B200)
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
--- a/.buildkite/test_areas/basic_correctness.yaml
+++ b/.buildkite/test_areas/basic_correctness.yaml
@ -0,0 +1,16 @@
+group: Basic Correctness
+depends_on: 
+  - image-build
+steps:
+- label: Basic Correctness
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_basic_correctness
+  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_cumem.py
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s basic_correctness/test_cumem.py
+  - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
--- a/.buildkite/test_areas/benchmarks.yaml
+++ b/.buildkite/test_areas/benchmarks.yaml
@ -0,0 +1,19 @@
+group: Benchmarks
+depends_on: 
+  - image-build
+steps:
+- label: Benchmarks
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/.buildkite"
+  source_file_dependencies:
+  - benchmarks/
+  commands:
+  - bash scripts/run-benchmarks.sh
+
+- label: Benchmarks CLI Test
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - vllm/
+  - tests/benchmarks/
+  commands:
+  - pytest -v -s benchmarks/
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@ -0,0 +1,57 @@
+group: Compile
+depends_on: 
+  - image-build
+steps:
+- label: Fusion and Compile Tests (B200)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
+  commands:
+    - nvidia-smi
+    - pytest -v -s tests/compile/test_fusion_attn.py
+    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # Wrap with quotes to escape yaml
+    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+
+- label: Fusion E2E (2 GPUs)(B200)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  commands:
+    - nvidia-smi
+    # Run all e2e fusion tests
+    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+
--- a/.buildkite/test_areas/cuda.yaml
+++ b/.buildkite/test_areas/cuda.yaml
@ -0,0 +1,22 @@
+group: CUDA
+depends_on: 
+  - image-build
+steps:
+- label: Platform Tests (CUDA)
+  timeout_in_minutes: 15
+  source_file_dependencies:
+  - vllm/
+  - tests/cuda
+  commands:
+    - pytest -v -s cuda/test_cuda_context.py
+
+- label: Cudagraph
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@ -0,0 +1,199 @@
+group: Distributed
+depends_on: 
+  - image-build
+steps:
+- label: Distributed Comm Ops
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/distributed
+  - tests/distributed
+  commands:
+  - pytest -v -s distributed/test_comm_ops.py
+  - pytest -v -s distributed/test_shm_broadcast.py
+  - pytest -v -s distributed/test_shm_buffer.py
+  - pytest -v -s distributed/test_shm_storage.py
+
+- label: Distributed (2 GPUs)
+  timeout_in_minutes: 90
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/compilation/
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/worker/worker_base.py
+  - vllm/v1/engine/
+  - vllm/v1/worker/
+  - tests/compile/fullgraph/test_basic_correctness.py
+  - tests/compile/test_wrapper.py
+  - tests/distributed/
+  - tests/entrypoints/llm/test_collective_rpc.py
+  - tests/v1/distributed
+  - tests/v1/entrypoints/openai/test_multi_api_servers.py
+  - tests/v1/shutdown
+  - tests/v1/worker/test_worker_memory_snapshot.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+  - pytest -v -s ./compile/test_wrapper.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - pytest -v -s distributed/test_sequence_parallel.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Tests (4 GPUs)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - tests/distributed/test_utils
+  - tests/distributed/test_pynccl
+  - tests/distributed/test_events
+  - tests/compile/fullgraph/test_basic_correctness.py
+  - examples/offline_inference/rlhf.py
+  - examples/offline_inference/rlhf_colocate.py
+  - examples/offline_inference/data_parallel.py  # run below as ../examples/offline_inference/data_parallel.py
+  - tests/v1/distributed
+  - tests/v1/engine/test_engine_core_client.py
+  - tests/distributed/test_symm_mem_allreduce.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and external_dp=2
+  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=2 and pp=2
+  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=4 and dp=1
+  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2, pp=2 and dp=1
+  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=1 and dp=4 with ep
+  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2 and dp=2 with ep
+  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with internal dp
+  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
+  - pytest -v -s distributed/test_utils.py
+  - pytest -v -s compile/fullgraph/test_basic_correctness.py
+  - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s distributed/test_events.py
+  - pytest -v -s distributed/test_symm_mem_allreduce.py
+  # TODO: create a dedicated test section for multi-GPU example tests
+  # when we have multiple distributed example tests
+  - cd ../examples/offline_inference
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+
+- label: Distributed Tests (8 GPUs)(H100)
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
+- label: Distributed Tests (4 GPUs)(A100)
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/
+  commands:
+  # NOTE: do not test the Llama model here; its HF implementation appears to be buggy.
+  # See https://github.com/vllm-project/vllm/pull/5689 for details.
+  - pytest -v -s distributed/test_custom_all_reduce.py
+  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - pytest -v -s -x lora/test_mixtral.py
+
+- label: Distributed Tests (2 GPUs)(H200)
+  gpu: h200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+    - pytest -v -s tests/distributed/test_context_parallel.py
+    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: Distributed Tests (2 GPUs)(B200)
+  gpu: b200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+    - pytest -v -s tests/distributed/test_context_parallel.py
+    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+    - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: 2 Node Test (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  num_nodes: 2
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  - examples/offline_inference/data_parallel.py  # invoked on both nodes by the command below
+  commands:
+    - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
+
+- label: Distributed NixlConnector PD accuracy (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+
+- label: Pipeline + Context Parallelism (4 GPUs)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  commands:
+  - pytest -v -s distributed/test_pp_cudagraph.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@ -0,0 +1,59 @@
+group: E2E Integration
+depends_on: 
+  - image-build
+steps:
+- label: DeepSeek V2-Lite Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: Prime-RL Integration (2 GPUs)
+  timeout_in_minutes: 30
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  source_file_dependencies:
+  - vllm/
+  - .buildkite/scripts/run-prime-rl-test.sh
+  commands:
+    - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@ -0,0 +1,26 @@
+group: Engine
+depends_on: 
+  - image-build
+steps:
+- label: Engine
+  timeout_in_minutes: 15
+  source_file_dependencies:
+  - vllm/
+  - tests/engine
+  - tests/test_sequence
+  - tests/test_config
+  - tests/test_logger
+  - tests/test_vllm_port
+  commands:
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+
+- label: V1 e2e + engine
+  timeout_in_minutes: 45
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    # TODO: accuracy does not match on H100, regardless of whether
+    # VLLM_USE_FLASHINFER_SAMPLER is set.
+    - pytest -v -s v1/e2e
+    - pytest -v -s v1/engine
--- a/.buildkite/test_areas/entrypoints.yaml
+++ b/.buildkite/test_areas/entrypoints.yaml
@ -0,0 +1,68 @@
+group: Entrypoints
+depends_on: 
+  - image-build
+steps:
+- label: Entrypoints Unit Tests  
+  timeout_in_minutes: 10
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/entrypoints
+  - tests/entrypoints/
+  commands:
+  - pytest -v -s entrypoints/openai/tool_parsers
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration (LLM)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/llm
+  - tests/entrypoints/offline_mode
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Entrypoints Integration (API Server)
+  timeout_in_minutes: 130
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/openai
+  - tests/entrypoints/test_chat_utils
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/test_chat_utils.py
+
+
+- label: Entrypoints Integration (Pooling)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/pooling
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/pooling
+
+
+- label: Entrypoints V1
+  timeout_in_minutes: 50
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    - pytest -v -s v1/entrypoints
+
+- label: OpenAI API Correctness
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - csrc/
+  - vllm/entrypoints/openai/
+  - vllm/model_executor/models/whisper.py
+  commands: # LMEval+Transcription WER check
+  - pytest -s entrypoints/openai/correctness/
--- a/.buildkite/test_areas/expert_parallelism.yaml
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@ -0,0 +1,24 @
+group: Expert Parallelism
+depends_on: 
+  - image-build
+steps:
+- label: EPLB Algorithm
+  timeout_in_minutes: 15
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_algo.py
+  commands:
+  - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_execute.py
+  - tests/distributed/test_eplb_spec_decode.py  # also run by this step, so it must trigger it
+  commands:
+  - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@ -0,0 +1,117 @@
+group: Kernels
+depends_on: 
+  - image-build
+steps:
+- label: Kernels Core Operation Test
+  timeout_in_minutes: 75
+  source_file_dependencies:
+  - csrc/
+  - tests/kernels/core
+  - tests/kernels/test_top_k_per_row.py
+  commands:
+    - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+
+- label: Kernels Attention Test %N
+  timeout_in_minutes: 35
+  source_file_dependencies:
+  - csrc/attention/
+  - vllm/attention
+  - vllm/v1/attention
+  - tests/kernels/attention
+  commands:
+    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels Quantization Test %N
+  timeout_in_minutes: 90
+  source_file_dependencies:
+  - csrc/quantization/
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization
+  commands:
+    - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels MoE Test %N
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - csrc/quantization/cutlass_w8a8/moe/
+  - csrc/moe/
+  - tests/kernels/moe
+  - vllm/model_executor/layers/fused_moe/
+  - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
+  commands:
+    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels Mamba Test
+  timeout_in_minutes: 45
+  source_file_dependencies:
+  - csrc/mamba/
+  - tests/kernels/mamba
+  - vllm/model_executor/layers/mamba/ops
+  commands:
+    - pytest -v -s kernels/mamba
+
+- label: Kernels DeepGEMM Test (H100)
+  timeout_in_minutes: 45
+  gpu: h100
+  num_gpus: 1
+  source_file_dependencies:
+  - tools/install_deepgemm.sh
+  - vllm/utils/deep_gemm.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
+  commands:
+    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+    - pytest -v -s kernels/moe/test_deepgemm.py
+    - pytest -v -s kernels/moe/test_batched_deepgemm.py
+    - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
+- label: Kernels (B200)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  # optional: true
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - csrc/attention/mla/
+  - csrc/quantization/cutlass_w8a8/moe/
+  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/attention/backends/mla/cutlass_mla.py
+  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/platforms/cuda.py
+  - vllm/attention/selector.py
+  commands:
+    - nvidia-smi
+    - python3 examples/offline_inference/basic/chat.py
+    # Attention
+    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+    - pytest -v -s tests/kernels/attention/test_attention_selector.py
+    - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+    - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
+    - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+    - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
+    # Quantization
+    - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+    - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+    - pytest -v -s tests/kernels/moe/test_flashinfer.py
+    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
--- a/.buildkite/test_areas/lm_eval.yaml
+++ b/.buildkite/test_areas/lm_eval.yaml
@ -0,0 +1,46 @@
+group: LM Eval
+depends_on: 
+  - image-build
+steps:
+- label: LM Eval Small Models
+  timeout_in_minutes: 75
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  autorun_on_main: true
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: LM Eval Large Models (4 GPUs)(A100)
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+- label: LM Eval Large Models (4 GPUs)(H100)
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
+    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+- label: LM Eval Small Models (B200)
+  timeout_in_minutes: 120
+  gpu: b200
+  optional: true
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
--- a/.buildkite/test_areas/lora.yaml
+++ b/.buildkite/test_areas/lora.yaml
@ -0,0 +1,31 @@
+group: LoRA
+depends_on: 
+  - image-build
+steps:
+- label: LoRA %N
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  commands:
+    - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
+  parallelism: 4
+
+
+- label: LoRA TP (Distributed)
+  timeout_in_minutes: 30
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  commands:
+    # FIXME: find out which code initializes CUDA before the test runs;
+    # until that is fixed, we need to use spawn to test it
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    # There is some Tensor Parallelism related processing logic in LoRA that
+    # requires multi-GPU testing for validation.
+    - pytest -v -s -x lora/test_chatglm3_tp.py
+    - pytest -v -s -x lora/test_llama_tp.py
+    - pytest -v -s -x lora/test_llm_with_multi_loras.py
+    - pytest -v -s -x lora/test_olmoe_tp.py
+    - pytest -v -s -x lora/test_gptoss_tp.py
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@ -0,0 +1,163 @@
+group: Miscellaneous
+depends_on:
+  - image-build
+steps:
+- label: V1 Others
+  timeout_in_minutes: 60
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    # split the test to avoid interference
+    - pytest -v -s -m 'not cpu_test' v1/core
+    - pytest -v -s v1/executor
+    - pytest -v -s v1/kv_offload
+    - pytest -v -s v1/sample
+    - pytest -v -s v1/logits_processors
+    - pytest -v -s v1/worker
+    - pytest -v -s v1/spec_decode
+    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'not cpu_test' v1/metrics
+    - pytest -v -s v1/test_oracle.py
+    - pytest -v -s v1/test_request.py
+    - pytest -v -s v1/test_outputs.py
+    # Integration test for streaming correctness (requires special branch).
+    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
+    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+
+# CPU-only counterpart of "V1 Others": no GPU is requested and the tests
+# selected are those marked `cpu_test` plus suites run without a GPU.
+# NOTE(review): `depends_on: null` presumably clears the group-level
+# image-build dependency - confirm against the pipeline generator.
+- label: V1 Others (CPU)
+  no_gpu: true
+  depends_on: null
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    # One pytest invocation per target so the suites do not interfere
+    - pytest -v -s -m 'cpu_test' v1/core
+    - pytest -v -s v1/structured_output
+    - pytest -v -s v1/test_serial_utils.py
+    - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'cpu_test' v1/metrics
+
+- label: Regression
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - vllm/
+  - tests/test_regression
+  commands:
+  - pip install modelscope
+  - pytest -v -s test_regression.py
+  working_dir: "/vllm-workspace/tests" # optional
+
+- label: Examples
+  timeout_in_minutes: 45
+  working_dir: "/vllm-workspace/examples"
+  source_file_dependencies:
+  - vllm/entrypoints
+  - vllm/multimodal
+  - examples/
+  commands:
+    - pip install tensorizer # for tensorizer test
+    - python3 offline_inference/basic/chat.py # for basic
+    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+    - python3 offline_inference/basic/classify.py
+    - python3 offline_inference/basic/embed.py
+    - python3 offline_inference/basic/score.py
+    # for multi-modal models
+    - python3 offline_inference/audio_language.py --seed 0
+    - python3 offline_inference/vision_language.py --seed 0
+    - python3 offline_inference/vision_language_multi_image.py --seed 0
+    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+    # for pooling models
+    - python3 pooling/pooling/vision_language_pooling.py --seed 0
+    # for features demo
+    - python3 offline_inference/prefix_caching.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+
+- label: Metrics, Tracing (2 GPUs)
+  timeout_in_minutes: 20
+  num_gpus: 2
+  source_file_dependencies:
+    - vllm/
+    - tests/v1/tracing
+  commands:
+    # OpenTelemetry packages are installed in this step before the tracing
+    # tests run. The folded scalar joins the lines below with single spaces
+    # into one `pip install` command.
+    - >-
+      pip install
+      'opentelemetry-sdk>=1.26.0'
+      'opentelemetry-api>=1.26.0'
+      'opentelemetry-exporter-otlp>=1.26.0'
+      'opentelemetry-semantic-conventions-ai>=0.4.1'
+    - pytest -v -s v1/tracing
+
+# Runs the standalone python-only compile script; triggered by changes to
+# that script or to setup.py.
+# NOTE(review): `depends_on: null` presumably clears the group-level
+# image-build dependency - confirm against the pipeline generator.
+- label: Python-only Installation
+  timeout_in_minutes: 20
+  depends_on: null
+  source_file_dependencies:
+    - tests/standalone_tests/python_only_compile.sh
+    - setup.py
+  commands:
+    - bash standalone_tests/python_only_compile.sh
+
+- label: Async Engine, Inputs, Utils, Worker
+  timeout_in_minutes: 50
+  source_file_dependencies:
+  - vllm/
+  - tests/multimodal
+  - tests/utils_
+  commands:
+  - pytest -v -s -m 'not cpu_test' multimodal
+  - pytest -v -s utils_
+
+# CPU-only step: no GPU is requested, and the multimodal suite is limited
+# to tests marked `cpu_test`.
+# NOTE(review): `depends_on: null` presumably clears the group-level
+# image-build dependency - confirm against the pipeline generator.
+- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
+  no_gpu: true
+  depends_on: null
+  timeout_in_minutes: 20
+  source_file_dependencies:
+    - vllm/
+    - tests/test_inputs.py
+    - tests/test_outputs.py
+    - tests/multimodal
+    - tests/standalone_tests/lazy_imports.py
+    - tests/tokenizers_
+    - tests/transformers_utils
+    - tests/config
+  commands:
+    - python3 standalone_tests/lazy_imports.py
+    - pytest -v -s test_inputs.py
+    - pytest -v -s test_outputs.py
+    - pytest -v -s -m 'cpu_test' multimodal
+    - pytest -v -s tokenizers_
+    - pytest -v -s transformers_utils
+    - pytest -v -s config
+
+- label: GPT-OSS Eval (B200)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - uv pip install --system 'gpt-oss[eval]==0.0.5'
+    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Batch Invariance (H100)
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - vllm/model_executor/layers
+    - tests/v1/determinism/
+  commands:
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pip install pytest-timeout pytest-forked
+    - pytest -v -s v1/determinism/test_batch_invariance.py
+    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
--- a/.buildkite/test_areas/model_executor.yaml
+++ b/.buildkite/test_areas/model_executor.yaml
@ -0,0 +1,17 @@
+group: Model Executor
+depends_on:
+  - image-build
+steps:
+- label: Model Executor
+  timeout_in_minutes: 35
+  source_file_dependencies:
+  - vllm/engine/arg_utils.py
+  - vllm/config/model.py
+  - vllm/model_executor
+  - tests/model_executor
+  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
+  commands:
+    - apt-get update && apt-get install -y curl libsodium23
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -v -s model_executor
+    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@ -0,0 +1,62 @@
+group: Models - Basic
+depends_on:
+  - image-build
+steps:
+- label: Basic Models Tests (Initialization)
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_initialization.py
+  commands:
+    # Run a subset of model initialization tests
+    - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
+
+- label: Basic Models Tests (Extra Initialization) %N
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/test_initialization.py
+  commands:
+    # Only when vLLM model source is modified - test initialization of a large
+    # subset of supported models (the complement of the small subset in the above
+    # test). Also runs if the model initialization test file is modified.
+    - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Basic Models Tests (Other)
+  timeout_in_minutes: 45
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_transformers.py
+  - tests/models/test_registry.py
+  commands:
+    - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+  timeout_in_minutes: 10
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_utils.py
+  - tests/models/test_vision.py
+  no_gpu: true
+  commands:
+    - pytest -v -s models/test_utils.py models/test_vision.py
+
+- label: Transformers Nightly Models
+  working_dir: "/vllm-workspace/"
+  optional: true
+  soft_fail: true
+  commands:
+    - pip install --upgrade git+https://github.com/huggingface/transformers
+    - pytest -v -s tests/models/test_initialization.py
+    - pytest -v -s tests/models/test_transformers.py
+    - pytest -v -s tests/models/multimodal/processing/
+    - pytest -v -s tests/models/multimodal/test_mapping.py
+    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+    # Whisper needs spawn method to avoid deadlock
+    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
--- a/.buildkite/test_areas/models_distributed.yaml
+++ b/.buildkite/test_areas/models_distributed.yaml
@ -0,0 +1,22 @@
+group: Models - Distributed
+depends_on:
+  - image-build
+steps:
+- label: Distributed Model Tests (2 GPUs)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/model_executor/model_loader/sharded_state_loader.py
+  - vllm/model_executor/models/
+  - tests/basic_correctness/
+  - tests/model_executor/model_loader/test_sharded_state_loader.py
+  - tests/models/
+  commands:
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
+  # Avoid importing model tests that cause CUDA reinitialization error
+  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
--- a/.buildkite/test_areas/models_language.yaml
+++ b/.buildkite/test_areas/models_language.yaml
@ -0,0 +1,91 @@
+group: Models - Language
+depends_on:
+  - image-build
+steps:
+- label: Language Models Tests (Standard)
+  timeout_in_minutes: 25
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language
+  commands:
+    # Test standard language models, excluding a subset of slow tests
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/language -m 'core_model and (not slow_test)'
+
+- label: Language Models Tests (Extra Standard) %N
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/language/pooling/test_embedding.py
+  - tests/models/language/generation/test_common.py
+  - tests/models/language/pooling/test_classification.py
+  commands:
+    # Shard slow subset of standard language models tests. Only run when model
+    # source is modified, or when specified test files are modified
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation
+  commands:
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+    # Shard hybrid language model tests
+    - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Language Models Test (Extended Generation) # 80min
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation
+  commands:
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+
+- label: Language Models Test (PPL)
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation_ppl_test
+  commands:
+    - pytest -v -s models/language/generation_ppl_test
+
+- label: Language Models Test (Extended Pooling)  # 36min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/pooling
+  commands:
+    - pytest -v -s models/language/pooling -m 'not core_model'
+
+- label: Language Models Test (MTEB)
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/pooling_mteb_test
+  commands:
+    - pytest -v -s models/language/pooling_mteb_test
--- a/.buildkite/test_areas/models_multimodal.yaml
+++ b/.buildkite/test_areas/models_multimodal.yaml
@ -0,0 +1,79 @@
+group: Models - Multimodal
+depends_on:
+  - image-build
+steps:
+- label: Multi-Modal Models (Standard) # 60min
+  timeout_in_minutes: 80
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Processor Test (CPU)
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  no_gpu: true
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Processor # 44min
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
+- label: Multi-Modal Models (Extended) 1
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+
+- label: Multi-Modal Models (Extended) 2
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+
+- label: Multi-Modal Models (Extended) 3
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+
+# This test is used only in PR development phase to test individual models and should never run on main
+- label: Custom Models
+  optional: true
+  commands:
+    - echo 'Testing custom models...'
+    # PR authors can temporarily add commands below to test individual models
+    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
--- a/.buildkite/test_areas/plugins.yaml
+++ b/.buildkite/test_areas/plugins.yaml
@ -0,0 +1,34 @@
+group: Plugins
+depends_on:
+  - image-build
+steps:
+- label: Plugin Tests (2 GPUs)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/plugins/
+  - tests/plugins/
+  commands:
+  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
+  - pip install -e ./plugins/vllm_add_dummy_platform
+  - pytest -v -s plugins_tests/test_platform_plugins.py
+  - pip uninstall vllm_add_dummy_platform -y
+  # end platform plugin tests
+  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+  - pip install -e ./plugins/prithvi_io_processor_plugin
+  - pytest -v -s plugins_tests/test_io_processor_plugins.py
+  - pip uninstall prithvi_io_processor_plugin -y
+  # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
+  # other tests continue here:
+  - pytest -v -s plugins_tests/test_scheduler_plugins.py
+  - pip install -e ./plugins/vllm_add_dummy_model
+  - pytest -v -s distributed/test_distributed_oot.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+  - pytest -v -s models/test_oot_registration.py # it needs a clean process
+  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
--- a/.buildkite/test_areas/pytorch.yaml
+++ b/.buildkite/test_areas/pytorch.yaml
@ -0,0 +1,50 @@
+group: PyTorch
+depends_on:
+  - image-build
+steps:
+- label: PyTorch Compilation Unit Tests
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/
+    - tests/compile
+  commands:
+  # Run unit tests defined directly under compile/,
+  # not including subdirectories, which are usually heavier
+  # tests covered elsewhere.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph Smoke Test
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  # Run smoke tests under fullgraph directory, except test_full_graph.py
+  # as it is a heavy test that is covered in other steps.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph
+  timeout_in_minutes: 40
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  # fp8 kv scales not supported on sm89, tested on Blackwell instead
+  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+  # Limit to no custom ops to reduce running time
+  # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+  # if this test fails, it means the nightly torch version is not compatible with some
+  # of the dependencies. Please check the error message and add the package to whitelist
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
+  soft_fail: true
+  source_file_dependencies:
+  - requirements/nightly_torch_test.txt
+  commands:
+  - bash standalone_tests/pytorch_nightly_dependency.sh
--- a/.buildkite/test_areas/quantization.yaml
+++ b/.buildkite/test_areas/quantization.yaml
@ -0,0 +1,46 @@
+group: Quantization
+depends_on:
+  - image-build
+steps:
+- label: Quantization
+  timeout_in_minutes: 90
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  - tests/quantization
+  commands:
+  # temporary install here since we need nightly, will move to requirements/test.in
+  # after torchao 0.12 release, and pin a working version of torchao nightly here
+
+  # since torchao nightly is only compatible with torch nightly currently
+  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+  # we can only upgrade after this is resolved
+  # TODO(jerryzh168): resolve the above comment
+  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - uv pip install --system conch-triton-kernels
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
+
+- label: Quantized MoE Test (B200)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - tests/quantization/test_blackwell_moe.py
+  - vllm/model_executor/models/deepseek_v2.py
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/models/llama4.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization/compressed_tensors
+  - vllm/model_executor/layers/quantization/modelopt.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Quantized Models Test
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/model_executor/layers/quantization
+  - tests/models/quantization
+  commands:
+    - pytest -v -s models/quantization
--- a/.buildkite/test_areas/samplers.yaml
+++ b/.buildkite/test_areas/samplers.yaml
@ -0,0 +1,14 @@
+group: Samplers
+depends_on:
+  - image-build
+steps:
+- label: Samplers Test
+  timeout_in_minutes: 75
+  source_file_dependencies:
+  - vllm/model_executor/layers
+  - vllm/sampling_metadata.py
+  - tests/samplers
+  - tests/conftest.py
+  commands:
+    - pytest -v -s samplers
+    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
--- a/.buildkite/test_areas/tool_use.yaml
+++ b/.buildkite/test_areas/tool_use.yaml
@ -0,0 +1,23 @@
+group: Tool use
+depends_on:
+  - image-build
+steps:
+- label: OpenAI-Compatible Tool Use
+  timeout_in_minutes: 35
+  mirror_hardwares: [amdexperimental]
+  fast_check: false
+  source_file_dependencies:
+    - vllm/
+    - tests/tool_use
+  commands:
+    - pytest -v -s -m 'not cpu_test' tool_use
+
+# CPU-only counterpart of "OpenAI-Compatible Tool Use": no GPU is requested
+# and only tests marked `cpu_test` are selected.
+# NOTE(review): `depends_on: null` presumably clears the group-level
+# image-build dependency - confirm against the pipeline generator.
+- label: OpenAI-Compatible Tool Use (CPU)
+  no_gpu: true
+  depends_on: null
+  timeout_in_minutes: 10
+  source_file_dependencies:
+    - vllm/
+    - tests/tool_use
+  commands:
+    - pytest -v -s -m 'cpu_test' tool_use
--- a/.buildkite/test_areas/weight_loading.yaml
+++ b/.buildkite/test_areas/weight_loading.yaml
@ -0,0 +1,25 @@
+group: Weight Loading
+depends_on:
+  - image-build
+steps:
+- label: Weight Loading Multiple GPU  # 33min
+  timeout_in_minutes: 45
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU - Large Models # optional
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  gpu: a100
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@ -13,7 +13,7 @@ jobs:

    steps:
      - name: Checkout repository
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1

      - name: Set up Python
        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@ -12,7 +12,7 @@ jobs:
    timeout-minutes: 30

    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1

      - uses: astral-sh/setup-uv@v7
        with:
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -16,7 +16,7 @@ jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
-    - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+    - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
    - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
      with:
        python-version: "3.12"
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -15,7 +15,7 @@ jobs:
      actions: write
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+      - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -874,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
    set(SRCS
-       "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
+       "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
+       "csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
+       "csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
+       )

    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -944,7 +947,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
  "csrc/moe/moe_align_sum_kernels.cu"
-  "csrc/moe/moe_lora_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@ -96,8 +96,9 @@ start_server() {
    # This correctly passes each element as a separate argument.
    if [[ -n "$profile_dir" ]]; then
        # Start server with profiling enabled
-        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
-            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
+        local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
+        VLLM_SERVER_DEV_MODE=1 \
+            vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
    else
        # Start server without profiling
        VLLM_SERVER_DEV_MODE=1 \
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@ -32,12 +32,11 @@ def benchmark_propose(args):

        model_config = ModelConfig(
            model="facebook/opt-125m",
-            task="generate",
            max_model_len=args.num_token + args.num_spec_token,
            tokenizer="facebook/opt-125m",
            tokenizer_mode="auto",
            dtype="auto",
-            seed=None,
+            seed=0,
            trust_remote_code=False,
        )
        proposer = NgramProposer(
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -574,7 +574,7 @@ async def benchmark(
    )
    print(
        "{:<40} {:<10.2f}".format(
-            "Total Token throughput (tok/s):", metrics.total_token_throughput
+            "Total token throughput (tok/s):", metrics.total_token_throughput
        )
    )

@ -963,8 +963,7 @@ def create_argument_parser():
    parser.add_argument(
        "--profile",
        action="store_true",
-        help="Use Torch Profiler. The endpoint must be launched with "
-        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+        help="Use vLLM Profiling. --profiler-config must be provided on the server.",
    )
    parser.add_argument(
        "--result-dir",
--- a/benchmarks/kernels/benchmark_mla_k_concat.py
+++ b/benchmarks/kernels/benchmark_mla_k_concat.py
@ -0,0 +1,150 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
+in MLA (Multi-head Latent Attention) prefill.
+
+This validates that the optimization from commit 8d4142bd is beneficial across
+various batch sizes, not just the originally tested batch size of 32768.
+"""
+
+import time
+from collections.abc import Callable
+
+import torch
+
+# DeepSeek-V3 MLA dimensions
+NUM_HEADS = 128
+QK_NOPE_HEAD_DIM = 128
+PE_DIM = 64
+
+
+def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
+    """Baseline: expand ``k_pe`` to ``k_nope``'s leading dims, then ``torch.cat``.
+
+    Args:
+        k_nope: Tensor whose leading dimensions define the output layout.
+        k_pe: Tensor expanded so that every dimension except the last matches
+            ``k_nope``; its last dimension is kept as-is.
+
+    Returns:
+        ``k_nope`` and the expanded ``k_pe`` concatenated along the last dim.
+    """
+    expanded_shape = (*k_nope.shape[:-1], -1)
+    k_pe_expanded = k_pe.expand(expanded_shape)
+    return torch.cat((k_nope, k_pe_expanded), dim=-1)
+
+
+def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
+    """Preallocate the output and copy both inputs into last-dim slices.
+
+    This is the alternative to ``expand`` + ``torch.cat``: one uninitialized
+    buffer is allocated with ``k_nope``'s dtype and device, then filled by
+    two slice assignments.
+
+    Args:
+        k_nope: Tensor copied into the leading part of the last dimension.
+        k_pe: Tensor copied into the trailing part of the last dimension; the
+            slice assignment broadcasts it wherever it has size 1.
+
+    Returns:
+        A new tensor shaped like ``k_nope`` with its last dimension extended
+        by ``k_pe.shape[-1]``.
+    """
+    nope_dim = k_nope.shape[-1]
+    pe_dim = k_pe.shape[-1]
+    out = torch.empty(
+        (*k_nope.shape[:-1], nope_dim + pe_dim),
+        dtype=k_nope.dtype,
+        device=k_nope.device,
+    )
+    out[..., :nope_dim] = k_nope
+    out[..., nope_dim:] = k_pe
+    return out
+
+
+def benchmark_method(
+    method: Callable,
+    k_nope: torch.Tensor,
+    k_pe: torch.Tensor,
+    num_warmup: int = 10,
+    num_iters: int = 100,
+) -> float:
+    """Time ``method(k_nope, k_pe)`` and return the mean latency in ms.
+
+    Args:
+        method: Callable invoked as ``method(k_nope, k_pe)``; its result is
+            discarded.
+        k_nope: First tensor argument passed to ``method``.
+        k_pe: Second tensor argument passed to ``method``.
+        num_warmup: Number of untimed calls made before measuring.
+        num_iters: Number of timed calls; the total time is divided by this.
+
+    Returns:
+        Mean wall-clock time per call, in milliseconds.
+    """
+
+    def _run(count: int) -> None:
+        # Issue `count` calls, then wait for all queued CUDA work to finish.
+        for _ in range(count):
+            method(k_nope, k_pe)
+        torch.cuda.synchronize()
+
+    # Untimed warmup pass
+    _run(num_warmup)
+
+    # Timed pass: the synchronize inside _run is included in the measurement
+    t0 = time.perf_counter()
+    _run(num_iters)
+    t1 = time.perf_counter()
+
+    return (t1 - t0) / num_iters * 1000  # seconds -> ms
+
+
+@torch.inference_mode()
+def run_benchmark(dtype: torch.dtype, dtype_name: str):
+    """Benchmark both concatenation methods for one dtype and print a report.
+
+    Args:
+        dtype: Element type of the benchmark tensors. Inputs are generated in
+            float32 and converted with ``.to(dtype)``.
+        dtype_name: Label for ``dtype`` printed in the report header.
+
+    Returns:
+        A list with one ``(batch_size, cat_ms, direct_ms, speedup,
+        reduction_pct)`` tuple per tested batch size.
+    """
+    torch.set_default_device("cuda")
+
+    # Batch sizes to test (powers of 2 from 32 to 65536)
+    batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]
+
+    print("=" * 80)
+    print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
+    print("=" * 80)
+    print(
+        f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
+        f"k_pe=[B, 1, {PE_DIM}]"
+    )
+    print(f"dtype: {dtype_name}")
+    print()
+    print(
+        f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
+        f"{'Speedup':>8} | {'Reduction':>10}"
+    )
+    print("-" * 70)
+
+    results = []
+    for batch_size in batch_sizes:
+        # Create input tensors (generate in float32 then convert for FP8 compatibility)
+        k_nope = torch.randn(
+            batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
+        ).to(dtype)
+        k_pe = torch.randn(
+            batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda"
+        ).to(dtype)
+
+        # Benchmark both methods
+        cat_time = benchmark_method(cat_method, k_nope, k_pe)
+        direct_time = benchmark_method(direct_copy_method, k_nope, k_pe)
+
+        # speedup > 1 means direct copy is faster; reduction is the latency
+        # saved relative to torch.cat, in percent.
+        speedup = cat_time / direct_time
+        reduction = (1 - direct_time / cat_time) * 100
+
+        results.append((batch_size, cat_time, direct_time, speedup, reduction))
+
+        print(
+            f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | "
+            f"{speedup:>7.2f}x | {reduction:>9.1f}%"
+        )
+
+    print("=" * 80)
+
+    # Summary statistics
+    speedups = [r[3] for r in results]
+    print("\nSpeedup summary:")
+    print(f"  Min:  {min(speedups):.2f}x")
+    print(f"  Max:  {max(speedups):.2f}x")
+    print(f"  Mean: {sum(speedups) / len(speedups):.2f}x")
+
+    # Find crossover point: the first tested batch size at which direct copy
+    # is at least as fast as torch.cat (None if that never happens).
+    crossover_batch = next(
+        (batch_size for batch_size, _, _, speedup, _ in results if speedup >= 1.0),
+        None,
+    )
+
+    print("\nConclusion:")
+    if crossover_batch is not None:
+        print(f"  - Direct copy becomes beneficial at batch size >= {crossover_batch}")
+    else:
+        # Report the negative result explicitly instead of printing nothing.
+        print("  - Direct copy was not faster than torch.cat at any tested batch size")
+    # Filter for large batches (>= 512 which is typical for prefill)
+    large_batch_speedups = [r[3] for r in results if r[0] >= 512]
+    if large_batch_speedups:
+        avg_large = sum(large_batch_speedups) / len(large_batch_speedups)
+        print(f"  - For batch sizes >= 512: avg speedup = {avg_large:.2f}x")
+        # Only claim the optimization is effective when the data supports it.
+        if avg_large >= 1.0:
+            print(
+                "  - MLA prefill typically uses large batches, "
+                "so optimization is effective"
+            )
+        else:
+            print(
+                "  - MLA prefill typically uses large batches, "
+                "where direct copy was not faster in this run"
+            )
+
+    return results
+
+
+@torch.inference_mode()
+def main():
+    """Run the benchmark once per dtype: bfloat16 first, then float8_e4m3fn."""
+    for dtype_name in ("bfloat16", "float8_e4m3fn"):
+        # Blank lines separate the per-dtype reports.
+        print("\n")
+        # Resolve the dtype just before its run, one dtype at a time.
+        run_benchmark(getattr(torch, dtype_name), dtype_name)
+
+
+if __name__ == "__main__":
+    main()
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
        endif()

        # Build ACL with CMake
-        set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
-        set(CMAKE_BUILD_TYPE "Release")
-        set(ARM_COMPUTE_ARCH "armv8.2-a")
-        set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
-        set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
-        set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
-        set(ARM_COMPUTE_ENABLE_OPENMP "ON")
-        set(ARM_COMPUTE_ENABLE_WERROR "OFF")
-        set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
-        set(ARM_COMPUTE_BUILD_TESTING "OFF")
-
        set(_cmake_config_cmd
             ${CMAKE_COMMAND} -G Ninja -B build 
            -DARM_COMPUTE_BUILD_SHARED_LIB=OFF 
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@ -117,7 +117,6 @@ torch::Tensor get_scheduler_metadata(
  input.casual = casual;
  input.isa = isa;
  input.enable_kv_split = enable_kv_split;
-  TORCH_CHECK(casual, "Only supports casual mask for now.");

  VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
    CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@ -186,7 +186,7 @@ struct AttentionMetadata {
 //  - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2
 //  * q_tile_size * 4, partial output, max + sum (float)
 // Reduction scratchpad contains:
-//  - flags: bool array to indicate wether the split is finished
+//  - flags: bool array to indicate whether the split is finished
 //  - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size
 //  - max, sum: 2 * split_num * q_tile_size * 4
 class AttentionScratchPad {
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@ -444,23 +444,27 @@ __device__ inline T apply_sigmoid(T val) {
  return cuda_cast<T, float>(sigmoid_accurate(f));
 }

-template <typename T>
+// Applies the scoring function selected at compile time by SF.
+// When SF == SCORING_SIGMOID the value is passed through apply_sigmoid;
+// for any other SF the value is returned unchanged. Using `if constexpr`
+// means the unused branch is discarded per instantiation, so no runtime
+// check on the scoring function is needed.
+template <ScoringFunc SF, typename T>
+__device__ inline T apply_scoring(T val) {
+  if constexpr (SF == SCORING_SIGMOID) {
+    return apply_sigmoid(val);
+  } else {
+    return val;
+  }
+}
+
+template <typename T, ScoringFunc SF>
 __device__ void topk_with_k2(T* output, T const* input, T const* bias,
                             cg::thread_block_tile<32> const& tile,
                             int32_t const lane_id,
-                             int const num_experts_per_group,
-                             int const scoring_func) {
+                             int const num_experts_per_group) {
  // Get the top2 per thread
  T largest = neg_inf<T>();
  T second_largest = neg_inf<T>();

  if (num_experts_per_group > WARP_SIZE) {
    for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
-      T value = input[i];
-      // Apply scoring function if needed
-      if (scoring_func == SCORING_SIGMOID) {
-        value = apply_sigmoid(value);
-      }
+      T value = apply_scoring<SF>(input[i]);
      value = value + bias[i];

      if (value > largest) {
@ -472,11 +476,7 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
    }
  } else {
    for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
-      T value = input[i];
-      // Apply scoring function if needed
-      if (scoring_func == SCORING_SIGMOID) {
-        value = apply_sigmoid(value);
-      }
+      T value = apply_scoring<SF>(input[i]);
      value = value + bias[i];
      largest = value;
    }
@ -501,13 +501,12 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
  }
 }

-template <typename T>
+template <typename T, ScoringFunc SF>
 __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
                                    int64_t const num_tokens,
                                    int64_t const num_cases,
                                    int64_t const n_group,
-                                    int64_t const num_experts_per_group,
-                                    int const scoring_func) {
+                                    int64_t const num_experts_per_group) {
  int32_t warp_id = threadIdx.x / WARP_SIZE;
  int32_t lane_id = threadIdx.x % WARP_SIZE;

@ -525,21 +524,21 @@ __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    asm volatile("griddepcontrol.wait;");
 #endif
-    topk_with_k2(output, input, group_bias, tile, lane_id,
-                 num_experts_per_group, scoring_func);
+    topk_with_k2<T, SF>(output, input, group_bias, tile, lane_id,
+                        num_experts_per_group);
  }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  asm volatile("griddepcontrol.launch_dependents;");
 #endif
 }

-template <typename T, typename IdxT>
+template <typename T, typename IdxT, ScoringFunc SF, int NGroup = -1>
 __global__ void group_idx_and_topk_idx_kernel(
    T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
    T const* bias, int64_t const num_tokens, int64_t const n_group,
    int64_t const topk_group, int64_t const topk, int64_t const num_experts,
    int64_t const num_experts_per_group, bool renormalize,
-    double routed_scaling_factor, int scoring_func) {
+    double routed_scaling_factor) {
  int32_t warp_id = threadIdx.x / WARP_SIZE;
  int32_t lane_id = threadIdx.x % WARP_SIZE;
  int32_t case_id =
@ -549,6 +548,11 @@ __global__ void group_idx_and_topk_idx_kernel(
  topk_values += case_id * topk;
  topk_indices += case_id * topk;

+  constexpr bool kUseStaticNGroup = (NGroup > 0);
+  // use int32 to avoid implicit conversion
+  int32_t const n_group_i32 =
+      kUseStaticNGroup ? NGroup : static_cast<int32_t>(n_group);
+
  int32_t align_num_experts_per_group =
      warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);

@ -574,13 +578,14 @@ __global__ void group_idx_and_topk_idx_kernel(

  if (case_id < num_tokens) {
    // calculate group_idx
-    int32_t target_num_min = WARP_SIZE - n_group + topk_group;
+    int32_t target_num_min =
+        WARP_SIZE - n_group_i32 + static_cast<int32_t>(topk_group);
    // The check is necessary to avoid abnormal input
-    if (lane_id < n_group && is_finite(group_scores[lane_id])) {
+    if (lane_id < n_group_i32 && is_finite(group_scores[lane_id])) {
      value = group_scores[lane_id];
    }

-    int count_equal_to_top_value = WARP_SIZE - n_group;
+    int count_equal_to_top_value = WARP_SIZE - n_group_i32;
    int pre_count_equal_to_top_value = 0;
    // Use loop to find the largest top_group
    while (count_equal_to_top_value < target_num_min) {
@ -604,7 +609,7 @@ __global__ void group_idx_and_topk_idx_kernel(
  int count_equalto_topkth_group = 0;
  bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
  if (case_id < num_tokens && if_proceed_next_topk) {
-    for (int i_group = 0; i_group < n_group; i_group++) {
+    auto process_group = [&](int i_group) {
      if ((group_scores[i_group] > topk_group_value) ||
          ((group_scores[i_group] == topk_group_value) &&
           (count_equalto_topkth_group < num_equalto_topkth_group))) {
@ -613,11 +618,10 @@ __global__ void group_idx_and_topk_idx_kernel(
             i += WARP_SIZE) {
          T candidates = neg_inf<T>();
          if (i < num_experts_per_group) {
-            // Apply scoring function (if any) and add bias
+            // apply scoring function (if any) and add bias
            T input = scores[offset + i];
            if (is_finite(input)) {
-              T score = (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input)
-                                                          : input;
+              T score = apply_scoring<SF>(input);
              candidates = score + bias[offset + i];
            }
          }
@ -627,6 +631,17 @@ __global__ void group_idx_and_topk_idx_kernel(
          count_equalto_topkth_group++;
        }
      }
+    };
+
+    if constexpr (kUseStaticNGroup) {
+#pragma unroll
+      for (int i_group = 0; i_group < NGroup; ++i_group) {
+        process_group(i_group);
+      }
+    } else {
+      for (int i_group = 0; i_group < n_group_i32; ++i_group) {
+        process_group(i_group);
+      }
    }
    queue.done();
    __syncwarp();
@ -646,12 +661,13 @@ __global__ void group_idx_and_topk_idx_kernel(
      if (i < topk) {
        // Load the score value (without bias) for normalization
        T input = scores[s_topk_idx[i]];
-        value =
-            (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input) : input;
+        value = apply_scoring<SF>(input);
        s_topk_value[i] = value;
      }
-      topk_sum +=
-          cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
+      if (renormalize) {
+        topk_sum +=
+            cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
+      }
    }
  }

@ -660,13 +676,9 @@ __global__ void group_idx_and_topk_idx_kernel(
  if (case_id < num_tokens) {
    if (if_proceed_next_topk) {
      for (int i = lane_id; i < topk; i += WARP_SIZE) {
-        float value;
-        if (renormalize) {
-          value = cuda_cast<float, T>(s_topk_value[i]) / topk_sum *
-                  routed_scaling_factor;
-        } else {
-          value = cuda_cast<float, T>(s_topk_value[i]) * routed_scaling_factor;
-        }
+        float base = cuda_cast<float, T>(s_topk_value[i]);
+        float value = renormalize ? (base / topk_sum * routed_scaling_factor)
+                                  : (base * routed_scaling_factor);
        topk_indices[i] = s_topk_idx[i];
        topk_values[i] = value;
      }
@ -684,6 +696,45 @@ __global__ void group_idx_and_topk_idx_kernel(
 #endif
 }

+// Host-side dispatcher for group_idx_and_topk_idx_kernel<T, IdxT, SF, NGroup>.
+//
+// Picks the kernel instantiation whose NGroup template argument matches the
+// runtime n_group when n_group is 4, 8, 16 or 32; any other value falls back
+// to the default instantiation (NGroup = -1), which reads n_group at runtime.
+// n_group is still passed as a kernel argument in every case.
+//
+// All remaining arguments are forwarded unchanged to cudaLaunchKernelEx using
+// the caller-provided launch configuration. The return value of
+// cudaLaunchKernelEx is not inspected here.
+template <typename T, typename IdxT, ScoringFunc SF>
+inline void launch_group_idx_and_topk_kernel(
+    cudaLaunchConfig_t const& config, T* scores, T* group_scores,
+    float* topk_values, IdxT* topk_indices, T const* bias,
+    int64_t const num_tokens, int64_t const n_group, int64_t const topk_group,
+    int64_t const topk, int64_t const num_experts,
+    int64_t const num_experts_per_group, bool const renormalize,
+    double const routed_scaling_factor) {
+  // Shared launch helper: only the kernel function pointer differs per case.
+  auto launch = [&](auto* kernel_instance2) {
+    cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
+                       topk_values, topk_indices, bias, num_tokens, n_group,
+                       topk_group, topk, num_experts, num_experts_per_group,
+                       renormalize, routed_scaling_factor);
+  };
+
+  switch (n_group) {
+    case 4: {
+      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 4>);
+      break;
+    }
+    case 8: {
+      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 8>);
+      break;
+    }
+    case 16: {
+      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 16>);
+      break;
+    }
+    case 32: {
+      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 32>);
+      break;
+    }
+    default: {
+      // No specialization for this group count: use the runtime-n_group path.
+      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF>);
+      break;
+    }
+  }
+}
+
 template <typename T, typename IdxT>
 void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
                   IdxT* topk_indices, T const* bias, int64_t const num_tokens,
@ -694,7 +745,6 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
                   cudaStream_t const stream = 0) {
  int64_t num_cases = num_tokens * n_group;
  int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
-  auto* kernel_instance1 = &topk_with_k2_kernel<T>;
  cudaLaunchConfig_t config;
  config.gridDim = topk_with_k2_num_blocks;
  config.blockDim = BLOCK_SIZE;
@ -705,16 +755,33 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
  attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
  config.numAttrs = 1;
  config.attrs = attrs;
-  cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
-                     num_tokens, num_cases, n_group, num_experts / n_group,
-                     scoring_func);
+  auto const sf = static_cast<ScoringFunc>(scoring_func);
+  int64_t const num_experts_per_group = num_experts / n_group;
+  auto launch_topk_with_k2 = [&](auto* kernel_instance1) {
+    cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
+                       num_tokens, num_cases, n_group, num_experts_per_group);
+  };
+  switch (sf) {
+    case SCORING_NONE: {
+      auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_NONE>;
+      launch_topk_with_k2(kernel_instance1);
+      break;
+    }
+    case SCORING_SIGMOID: {
+      auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_SIGMOID>;
+      launch_topk_with_k2(kernel_instance1);
+      break;
+    }
+    default:
+      // should be guarded by higher level checks.
+      TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
+  }

  int64_t topk_with_k_group_num_blocks =
      (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
  size_t dynamic_smem_in_bytes =
      warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
                                                           topk);
-  auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
  config.gridDim = topk_with_k_group_num_blocks;
  config.blockDim = BLOCK_SIZE;
  config.dynamicSmemBytes = dynamic_smem_in_bytes;
@ -723,10 +790,24 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
  attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
  config.numAttrs = 1;
  config.attrs = attrs;
-  cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
-                     topk_values, topk_indices, bias, num_tokens, n_group,
-                     topk_group, topk, num_experts, num_experts / n_group,
-                     renormalize, routed_scaling_factor, scoring_func);
+  switch (sf) {
+    case SCORING_NONE: {
+      launch_group_idx_and_topk_kernel<T, IdxT, SCORING_NONE>(
+          config, scores, group_scores, topk_values, topk_indices, bias,
+          num_tokens, n_group, topk_group, topk, num_experts,
+          num_experts_per_group, renormalize, routed_scaling_factor);
+      break;
+    }
+    case SCORING_SIGMOID: {
+      launch_group_idx_and_topk_kernel<T, IdxT, SCORING_SIGMOID>(
+          config, scores, group_scores, topk_values, topk_indices, bias,
+          num_tokens, n_group, topk_group, topk, num_experts,
+          num_experts_per_group, renormalize, routed_scaling_factor);
+      break;
+    }
+    default:
+      TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
+  }
 }

 #define INSTANTIATE_NOAUX_TC(T, IdxT)                                       \
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@ -14,7 +14,6 @@

 namespace vllm {
 namespace moe {
-
 namespace batched_moe_align_block_size {

 // Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
@ -80,23 +79,30 @@ __global__ void batched_moe_align_block_size_kernel(
 }  // namespace batched_moe_align_block_size

 template <typename scalar_t>
-__global__ void moe_align_block_size_kernel(
+__device__ void _moe_align_block_size(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
    int32_t* __restrict__ total_tokens_post_pad,
    int32_t* __restrict__ expert_map, int32_t num_experts,
    int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
    size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
-    bool has_expert_map) {
+    int32_t max_num_m_blocks, int32_t model_offset, int32_t inactive_expert_id,
+    int32_t topk_num, int32_t* token_mask, bool has_expert_map) {
  extern __shared__ int32_t shared_counts[];

-  // Use a separate threadblock to fill sorted_token_ids.
+  // Compute input buffer offsets. Typically these will all be 0, except when
+  // using Multi LoRA.
+  int sorted_token_ids_offset = max_num_tokens_padded * model_offset;
+  int expert_ids_offset = max_num_m_blocks * model_offset;
+  int cumsum_offset = (num_experts + 1) * model_offset;
+
+  // Use separate threadblocks to fill sorted_token_ids.
  // This is safe since the current kernel does not use sorted_token_ids.
-  if (blockIdx.x == 1) {
+  if (blockIdx.x % 2) {
    // Initialize sorted_token_ids with numel
    for (size_t it = threadIdx.x; it < max_num_tokens_padded;
         it += blockDim.x) {
-      sorted_token_ids[it] = numel;
+      sorted_token_ids[sorted_token_ids_offset + it] = numel;
    }
    return;
  }
@ -127,7 +133,9 @@ __global__ void moe_align_block_size_kernel(
    }
    int warp_idx = expert_id / experts_per_warp;
    int expert_offset = expert_id % experts_per_warp;
-    atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
+    int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
+    atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset],
+              mask);
  }

  __syncthreads();
@ -148,77 +156,44 @@ __global__ void moe_align_block_size_kernel(
  int cumsum_val;
  BlockScan(temp_storage).ExclusiveSum(expert_count, cumsum_val);
  if (expert_id <= num_experts) {
-    cumsum[expert_id] = cumsum_val;
+    cumsum[cumsum_offset + expert_id] = cumsum_val;
  }

  if (expert_id == num_experts) {
-    *total_tokens_post_pad = cumsum_val;
+    total_tokens_post_pad[model_offset] = cumsum_val;
  }

  __syncthreads();

  if (threadIdx.x < num_experts) {
-    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
-         i += block_size) {
-      expert_ids[i / block_size] = threadIdx.x;
+    for (int i = cumsum[cumsum_offset + threadIdx.x];
+         i < cumsum[cumsum_offset + threadIdx.x + 1]; i += block_size) {
+      expert_ids[expert_ids_offset + i / block_size] = threadIdx.x;
    }
  }

  // Fill remaining expert_ids with inactive_expert_id
-  const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
-  const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
-  for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
-    expert_ids[i] = 0;
-  }
-}
-
-template <typename scalar_t>
-__global__ void count_and_sort_expert_tokens_kernel(
-    const scalar_t* __restrict__ topk_ids,
-    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
-    int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
-    bool has_expert_map) {
-  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const size_t stride = blockDim.x * gridDim.x;
-
-  for (size_t i = tid; i < numel; i += stride) {
-    int32_t expert_id = topk_ids[i];
-    if (expert_id >= num_experts) {
-      continue;
-    }
-    if (has_expert_map) {
-      expert_id = expert_map[expert_id];
-      // filter invalid experts
-      if (expert_id == -1) continue;
-    }
-    int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
-    sorted_token_ids[rank_post_pad] = i;
-  }
-}
-
-template <typename scalar_t, int TOPK>
-__global__ void moe_sum_kernel(
-    scalar_t* __restrict__ out,          // [..., d]
-    const scalar_t* __restrict__ input,  // [..., topk, d]
-    const int d) {
-  const int64_t token_idx = blockIdx.x;
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    scalar_t x = 0.0;
-#pragma unroll
-    for (int k = 0; k < TOPK; ++k) {
-      x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]);
-    }
-    out[token_idx * d + idx] = x;
+  const size_t fill_start_idx =
+      cumsum[cumsum_offset + num_experts] / block_size + threadIdx.x;
+  for (size_t i = fill_start_idx; i < max_num_m_blocks; i += blockDim.x) {
+    expert_ids[expert_ids_offset + i] = inactive_expert_id;
  }
 }

 template <typename scalar_t, int32_t fill_threads>
-__global__ void moe_align_block_size_small_batch_expert_kernel(
+__device__ void _moe_align_block_size_small_batch_expert(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
    int32_t* __restrict__ total_tokens_post_pad,
    int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
-    size_t numel, int32_t max_num_tokens_padded, bool has_expert_map) {
+    size_t numel, int32_t max_num_tokens_padded, int32_t max_num_m_blocks,
+    int32_t inactive_expert_id, int32_t model_offset, int32_t topk_num,
+    int32_t* token_mask, bool has_expert_map) {
+  // Compute input buffer offsets. Typically these will all be 0, except when
+  // using Multi LoRA.
+  int sorted_token_ids_offset = max_num_tokens_padded * model_offset;
+  int expert_ids_offset = max_num_m_blocks * model_offset;
+
  // Use an additional group of threads to fill sorted_token_ids.
  // Since the current kernel will use sorted_token_ids afterward,
  // we fill sorted_token_ids within the same threadblock to make
@ -227,7 +202,7 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
    // Initialize sorted_token_ids with numel
    for (size_t it = threadIdx.x; it < max_num_tokens_padded;
         it += fill_threads) {
-      sorted_token_ids[it] = numel;
+      sorted_token_ids[sorted_token_ids_offset + it] = numel;
    }
    // Three __syncthreads() corresponding to the other threads
    __syncthreads();
@ -254,7 +229,8 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
      // filter invalid expert
      if (expert_id == -1) continue;
    }
-    ++tokens_cnts[(tid + 1) * num_experts + expert_id];
+    int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
+    tokens_cnts[(tid + 1) * num_experts + expert_id] += mask;
  }

  __syncthreads();
@ -277,22 +253,22 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
          CEILDIV(tokens_cnts[stride * num_experts + i - 1], block_size) *
              block_size;
    }
-    *total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
+    total_tokens_post_pad[model_offset] =
+        static_cast<int32_t>(cumsum[num_experts]);
  }

  __syncthreads();

  if (tid < num_experts) {
    for (int i = cumsum[tid]; i < cumsum[tid + 1]; i += block_size) {
-      expert_ids[i / block_size] = tid;
+      expert_ids[expert_ids_offset + i / block_size] = tid;
    }
  }

  // Fill remaining expert_ids with inactive_expert_id
  const size_t fill_start_idx = cumsum[num_experts] / block_size + tid;
-  const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
-  for (size_t i = fill_start_idx; i < expert_ids_size; i += stride) {
-    expert_ids[i] = 0;
+  for (size_t i = fill_start_idx; i < max_num_m_blocks; i += stride) {
+    expert_ids[expert_ids_offset + i] = inactive_expert_id;
  }

  for (size_t i = tid; i < numel; i += stride) {
@ -304,11 +280,195 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
    }
    int32_t rank_post_pad =
        tokens_cnts[tid * num_experts + expert_id] + cumsum[expert_id];
-    sorted_token_ids[rank_post_pad] = i;
-    ++tokens_cnts[tid * num_experts + expert_id];
+
+    if (token_mask == nullptr || token_mask[i / topk_num]) {
+      sorted_token_ids[sorted_token_ids_offset + rank_post_pad] = i;
+      ++tokens_cnts[tid * num_experts + expert_id];
+    }
  }
 }

+// Scatters the flat indices of topk_ids into sorted_token_ids, grouped by
+// expert.
+//
+// For every element i of topk_ids (numel entries) handled by this thread:
+//   - entries whose expert id is >= num_experts are skipped;
+//   - when has_expert_map is set, the id is remapped through expert_map and
+//     entries mapped to -1 are skipped;
+//   - when token_mask is non-null, the entry is skipped unless
+//     token_mask[i / topk_num] is non-zero;
+//   - otherwise a slot is reserved by atomically incrementing the expert's
+//     counter in cumsum_buffer, and i is written to that slot.
+//
+// model_offset selects the per-model slice of the output buffers:
+// cumsum_buffer is indexed at model_offset * (num_experts + 1) and
+// sorted_token_ids at model_offset * max_num_tokens_padded.
+//
+// Work is distributed along the grid's y dimension (blockIdx.y / gridDim.y),
+// not x, so callers must size the launch grid accordingly.
+template <typename scalar_t>
+__device__ void _count_and_sort_expert_tokens(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
+    int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
+    int32_t max_num_tokens_padded, int32_t* __restrict__ token_mask,
+    int32_t model_offset, int32_t topk_num, bool has_expert_map) {
+  const size_t tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const size_t stride = blockDim.x * gridDim.y;
+
+  for (size_t i = tid; i < numel; i += stride) {
+    int32_t expert_id = topk_ids[i];
+    if (expert_id >= num_experts) {
+      continue;
+    }
+
+    if (has_expert_map) {
+      expert_id = expert_map[expert_id];
+      // filter invalid experts
+      if (expert_id == -1) continue;
+    }
+
+    // i / topk_num is the index of the token this top-k entry belongs to.
+    if (token_mask == nullptr || token_mask[i / topk_num]) {
+      // atomicAdd returns the counter value before the increment, which is
+      // the destination slot for this entry.
+      int32_t rank_post_pad = atomicAdd(
+          &cumsum_buffer[(model_offset * (num_experts + 1)) + expert_id], 1);
+      sorted_token_ids[max_num_tokens_padded * model_offset + rank_post_pad] =
+          i;
+    }
+  }
+}
+
+// Kernel entry point for the single-model (non-LoRA) alignment path.
+// Forwards to _moe_align_block_size with:
+//   max_num_m_blocks   = CEILDIV(max_num_tokens_padded, block_size)
+//   model_offset       = 0
+//   inactive_expert_id = 0
+//   token_mask         = nullptr (every token is counted)
+template <typename scalar_t>
+__global__ void moe_align_block_size_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
+    int32_t* __restrict__ total_tokens_post_pad,
+    int32_t* __restrict__ expert_map, int32_t num_experts,
+    int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
+    size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
+    int32_t topk_num, bool has_expert_map) {
+  _moe_align_block_size(
+      topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
+      num_experts, padded_num_experts, experts_per_warp, block_size, numel,
+      cumsum, max_num_tokens_padded, CEILDIV(max_num_tokens_padded, block_size),
+      // model_offset, inactive_expert_id, topk_num, token_mask
+      0, 0, topk_num, nullptr, has_expert_map);
+}
+
+// Kernel entry point for the single-model (non-LoRA) sort path.
+// Forwards to _count_and_sort_expert_tokens with token_mask = nullptr and
+// model_offset = 0, so every valid entry is scattered into the first slice of
+// the output buffers. The helper indexes work by blockIdx.y / gridDim.y.
+template <typename scalar_t>
+__global__ void count_and_sort_expert_tokens_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
+    int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
+    int32_t max_num_tokens_padded, int32_t topk_num, bool has_expert_map) {
+  _count_and_sort_expert_tokens(
+      topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts,
+      // token_mask, model_offset
+      max_num_tokens_padded, nullptr, 0, topk_num, has_expert_map);
+}
+
+// Sums the TOPK slices of each token along the top-k axis:
+//   out[token, idx] = sum over k in [0, TOPK) of input[token, k, idx]
+// blockIdx.x selects the token; the threads of the block stride over the d
+// elements of that token. TOPK is a template parameter so the inner loop can
+// be unrolled.
+template <typename scalar_t, int TOPK>
+__global__ void moe_sum_kernel(
+    scalar_t* __restrict__ out,          // [..., d]
+    const scalar_t* __restrict__ input,  // [..., topk, d]
+    const int d) {
+  const int64_t token_idx = blockIdx.x;
+  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+    // Accumulate in scalar_t (no widening of the accumulator).
+    scalar_t x = 0.0;
+#pragma unroll
+    for (int k = 0; k < TOPK; ++k) {
+      x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]);
+    }
+    out[token_idx * d + idx] = x;
+  }
+}
+
+// Kernel entry point for the single-model (non-LoRA) small-batch alignment
+// path. Forwards to _moe_align_block_size_small_batch_expert with:
+//   max_num_m_blocks   = CEILDIV(max_num_tokens_padded, block_size)
+//   inactive_expert_id = 0
+//   model_offset       = 0
+//   token_mask         = nullptr (every token is counted)
+// Note that this helper takes inactive_expert_id before model_offset, the
+// reverse of _moe_align_block_size.
+template <typename scalar_t, int32_t fill_threads>
+__global__ void moe_align_block_size_small_batch_expert_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
+    int32_t* __restrict__ total_tokens_post_pad,
+    int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
+    size_t numel, int32_t max_num_tokens_padded, int32_t topk_num,
+    bool has_expert_map) {
+  _moe_align_block_size_small_batch_expert<scalar_t, fill_threads>(
+      topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
+      num_experts, block_size, numel, max_num_tokens_padded,
+      // max_num_m_blocks, inactive_expert_id, model_offset, topk_num, mask
+      CEILDIV(max_num_tokens_padded, block_size), 0, 0, topk_num, nullptr,
+      has_expert_map);
+}
+
+// Multi-LoRA variant of moe_align_block_size_kernel.
+//
+// Two consecutive thread blocks serve each entry of lora_ids
+// (lora_idx = blockIdx.x / 2). Inside _moe_align_block_size the odd block of
+// the pair only fills sorted_token_ids and returns, while the even block
+// counts tokens per expert and writes cumsum, expert_ids and
+// total_tokens_post_pad for this LoRA.
+//
+// Blocks return immediately when the slot is unused (lora_id == -1) or the
+// adapter is disabled (adapter_enabled[lora_id] == 0).
+//
+// token_mask is scratch storage with one slice of num_tokens entries per
+// LoRA id; slice entry i is 1 when token_lora_mapping[i] == lora_id, else 0.
+// The per-LoRA output slices are selected by passing lora_id as model_offset,
+// and unused expert_ids entries are filled with -1.
+template <typename scalar_t>
+__global__ void moe_lora_align_block_size_kernel(
+    scalar_t* __restrict__ topk_ids, int32_t* __restrict__ token_lora_mapping,
+    int64_t block_size, int32_t* __restrict__ expert_map, int num_experts,
+    int max_loras, size_t numel, int max_num_tokens_padded,
+    int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids,
+    int32_t* __restrict__ expert_ids, int32_t topk_num,
+    int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
+    int32_t* __restrict__ cumsum, int32_t experts_per_warp,
+    int32_t padded_num_experts, int32_t* lora_ids,
+    int32_t* __restrict__ token_mask, bool has_expert_map) {
+  int lora_idx = blockIdx.x / 2;
+  int lora_id = lora_ids[lora_idx];
+  if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
+    return;
+  }
+
+  // Populate the token_mask based on the token-LoRA mapping.
+  //
+  // Only the even (counting) block of the pair initializes per-LoRA state.
+  // Thread blocks have no guaranteed execution order, so if the odd block also
+  // stored 0 into total_tokens_post_pad[lora_id] it could do so after the even
+  // block had already written the final count, clobbering the result. The odd
+  // block never reads token_mask, so it does not need the mask either.
+  int num_tokens = numel / topk_num;
+  if (blockIdx.x % 2 == 0) {
+    if (threadIdx.x == 0) {
+      total_tokens_post_pad[lora_id] = 0;
+    }
+
+    // Each thread fills a strided subset of the mask; the barrier below makes
+    // the complete mask visible to the whole block before it is consumed.
+    for (int i = threadIdx.x; i < num_tokens; i += blockDim.x) {
+      token_mask[(lora_id * num_tokens) + i] =
+          (int)token_lora_mapping[i] == lora_id;
+    }
+  }
+
+  // Reached by every thread of both blocks (the condition above is uniform
+  // within a block), so the barrier is safe.
+  __syncthreads();
+
+  _moe_align_block_size(
+      topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
+      num_experts, padded_num_experts, experts_per_warp, block_size, numel,
+      // model_offset = lora_id, inactive_expert_id = -1
+      cumsum, max_num_tokens_padded, max_num_m_blocks, lora_id, -1, topk_num,
+      &token_mask[(lora_id * num_tokens)], has_expert_map);
+}
+
+// Multi-LoRA variant of count_and_sort_expert_tokens_kernel.
+//
+// blockIdx.x selects the entry of lora_ids to process; blocks whose entry is
+// -1 return immediately. The remaining work is delegated to
+// _count_and_sort_expert_tokens, which distributes elements over
+// blockIdx.y / gridDim.y, restricted to tokens whose entry in this LoRA's
+// token_mask slice is non-zero, and writes into the output slices selected by
+// model_offset = lora_id.
+//
+// NOTE(review): unlike moe_lora_align_block_size_kernel, this kernel does not
+// consult adapter_enabled. Confirm that token_mask and cumsum_buffer hold
+// valid data for LoRA ids whose adapter is disabled, since the align kernels
+// return before initializing them in that case.
+template <typename scalar_t>
+__global__ void lora_count_and_sort_expert_tokens_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
+    int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
+    int32_t max_num_tokens_padded, int32_t topk_num, int32_t* token_mask,
+    int32_t* lora_ids, bool has_expert_map) {
+  int lora_idx = blockIdx.x;
+  int lora_id = lora_ids[lora_idx];
+  if (lora_id == -1) {
+    return;
+  }
+
+  // Number of tokens; each LoRA id owns num_tokens entries of token_mask.
+  int num_tokens = numel / topk_num;
+
+  _count_and_sort_expert_tokens(
+      topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts,
+      max_num_tokens_padded, &token_mask[(lora_id * num_tokens)], lora_id,
+      topk_num, has_expert_map);
+}
+
+// Multi-LoRA variant of moe_align_block_size_small_batch_expert_kernel.
+//
+// blockIdx.x selects the entry of lora_ids to process (one block per entry).
+// Blocks return immediately when the slot is unused (lora_id == -1) or the
+// adapter is disabled (adapter_enabled[lora_id] == 0).
+//
+// Thread 0 zeroes total_tokens_post_pad[lora_id] and builds this LoRA's slice
+// of token_mask (entry i is 1 when token_lora_mapping[i] == lora_id, else 0).
+// After the barrier, the work is delegated to
+// _moe_align_block_size_small_batch_expert with inactive_expert_id = -1 and
+// model_offset = lora_id.
+template <typename scalar_t, int32_t fill_threads>
+__global__ void moe_lora_align_block_size_small_batch_expert_kernel(
+    scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
+    int64_t block_size, int32_t* __restrict__ expert_map, int num_experts,
+    int max_loras, size_t numel, int max_num_tokens_padded,
+    int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids,
+    int32_t* __restrict__ expert_ids, int topk_num,
+    int32_t* total_tokens_post_pad, int32_t* adapter_enabled, int32_t* lora_ids,
+    int32_t* token_mask, bool has_expert_map) {
+  int lora_idx = blockIdx.x;
+  int lora_id = lora_ids[lora_idx];
+  if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
+    return;
+  }
+
+  // Number of tokens; each LoRA id owns num_tokens entries of token_mask.
+  int num_tokens = numel / topk_num;
+  if (threadIdx.x == 0) {
+    total_tokens_post_pad[lora_id] = 0;
+
+    // Serial fill by a single thread.
+    for (int i = 0; i < num_tokens; i++) {
+      token_mask[(lora_id * num_tokens) + i] =
+          (int)token_lora_mapping[i] == lora_id;
+    }
+  }
+
+  // Make the mask written by thread 0 visible to the rest of the block.
+  __syncthreads();
+
+  _moe_align_block_size_small_batch_expert<scalar_t, fill_threads>(
+      topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
+      num_experts, block_size, numel, max_num_tokens_padded, max_num_m_blocks,
+      // inactive_expert_id = -1, model_offset = lora_id
+      -1, lora_id, topk_num, &token_mask[(lora_id * num_tokens)],
+      has_expert_map);
+}
+
 }  // namespace moe
 }  // namespace vllm

@ -365,7 +525,8 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
              experts_ids.data_ptr<int32_t>(),
              num_tokens_post_pad.data_ptr<int32_t>(),
              expert_map.data_ptr<int32_t>(), num_experts, block_size,
-              topk_ids.numel(), sorted_token_ids.size(0), has_expert_map);
+              topk_ids.numel(), sorted_token_ids.size(0), topk_ids.size(1),
+              has_expert_map);
        } else {
          torch::Tensor cumsum_buffer =
              torch::empty({num_experts + 1}, options_int);
@ -386,21 +547,23 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
              expert_map.data_ptr<int32_t>(), num_experts, padded_num_experts,
              experts_per_warp, block_size, topk_ids.numel(),
              cumsum_buffer.data_ptr<int32_t>(), sorted_token_ids.size(0),
-              has_expert_map);
+              topk_ids.size(1), has_expert_map);

          const int block_threads = std::min(256, (int)threads);
          const int num_blocks =
              (topk_ids.numel() + block_threads - 1) / block_threads;
          const int max_blocks = 65535;
          const int actual_blocks = std::min(num_blocks, max_blocks);
+          dim3 gridDims(1, actual_blocks);

          auto sort_kernel =
              vllm::moe::count_and_sort_expert_tokens_kernel<scalar_t>;
-          sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
+          sort_kernel<<<gridDims, block_threads, 0, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
              cumsum_buffer.data_ptr<int32_t>(), expert_map.data_ptr<int32_t>(),
-              topk_ids.numel(), num_experts, has_expert_map);
+              topk_ids.numel(), num_experts, sorted_token_ids.size(0),
+              topk_ids.size(1), has_expert_map);
        }
      });
 }
@ -474,3 +637,123 @@ void moe_sum(torch::Tensor& input,   // [num_tokens, topk, hidden_size]
      break;
  }
 }
+
+// Host entry point for LoRA-aware MoE block alignment.
+//
+// Picks one of two launch strategies and enqueues the kernels on the current
+// CUDA stream:
+//   * small-batch path (topk_ids.numel() < 1024 and num_experts <= 64): one
+//     thread block per LoRA slot running the fused small-batch kernel, which
+//     uses dynamic shared memory;
+//   * general path: the align kernel with two thread blocks per LoRA slot,
+//     followed by the count-and-sort kernel on a (max_loras, blocks) grid.
+//
+// Arguments (shapes are not validated here; the int32 dtypes follow from the
+// data_ptr<int32_t>() calls below, which throw on a dtype mismatch):
+//   topk_ids               integral tensor; size(1) is taken as top-k and
+//                          size(0) sizes the per-LoRA scratch mask.
+//   token_lora_mapping     int32 tensor, forwarded to the kernels.
+//   num_experts            number of experts; padded up to a multiple of
+//                          WARP_SIZE and required to stay below 1024.
+//   block_size             alignment block size, must be > 0.
+//   max_loras              number of LoRA slots; drives grid sizes and the
+//                          scratch buffer sizes.
+//   max_num_tokens_padded,
+//   max_num_m_blocks       forwarded to the kernels - presumably the per-LoRA
+//                          strides of sorted_token_ids / expert_ids;
+//                          TODO confirm against the kernel definitions.
+//   sorted_token_ids, expert_ids, num_tokens_post_pad
+//                          int32 output tensors written by the kernels.
+//   adapter_enabled, lora_ids
+//                          int32 tensors forwarded to the kernels.
+//   maybe_expert_map       optional int32 expert map; when absent an empty
+//                          tensor is passed and has_expert_map is false.
+//
+// Raises (via TORCH_CHECK / AT_CUDA_CHECK) when block_size <= 0, when the
+// padded expert count is >= 1024, when the device attribute query fails, or
+// when the small-batch path needs more shared memory than the device allows.
+void moe_lora_align_block_size(
+    torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
+    int64_t num_experts, int64_t block_size, int64_t max_loras,
+    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
+    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
+    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
+    torch::Tensor lora_ids, std::optional<torch::Tensor> maybe_expert_map) {
+  const int topk_num = topk_ids.size(1);
+
+  TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
+
+  // Opt-in per-block shared memory limit of the device holding topk_ids.
+  // The status is checked so that a failed query cannot leave the limit
+  // uninitialized and silently defeat the shared-memory guard below.
+  int device_max_shared_mem = 0;
+  auto dev = topk_ids.get_device();
+  AT_CUDA_CHECK(cudaDeviceGetAttribute(
+      &device_max_shared_mem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  // Round the expert count up to a whole number of warps.
+  int64_t padded_num_experts =
+      ((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
+
+  // BlockScan uses 1024 threads and assigns one thread per expert.
+  TORCH_CHECK(padded_num_experts < 1024,
+              "padded_num_experts must be less than 1024");
+
+  auto options_int =
+      torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
+  // Scratch buffer with topk_ids.size(0) int32 entries per LoRA slot; it is
+  // left uninitialized here and handed to the kernels.
+  torch::Tensor token_mask =
+      torch::empty({max_loras * topk_ids.size(0)}, options_int);
+  bool has_expert_map = maybe_expert_map.has_value();
+  torch::Tensor expert_map;
+  if (has_expert_map) {
+    expert_map = maybe_expert_map.value();
+  } else {
+    // Placeholder so that data_ptr<int32_t>() below is always callable.
+    expert_map = torch::empty({0}, options_int);
+  }
+
+  VLLM_DISPATCH_INTEGRAL_TYPES(
+      topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
+        bool small_batch_expert_mode =
+            (topk_ids.numel() < 1024) && (num_experts <= 64);
+
+        if (small_batch_expert_mode) {
+          const int32_t num_thread = max((int32_t)num_experts, 128);
+          // (num_thread + 1) rows of num_experts counters plus a cumsum of
+          // (num_experts + 1) entries, all int32.
+          const int32_t shared_mem =
+              (num_thread + 1) * num_experts * sizeof(int32_t) +
+              (num_experts + 1) * sizeof(int32_t);
+          TORCH_CHECK(shared_mem <= device_max_shared_mem,
+                      "Shared memory usage exceeds device limit.");
+
+          // threadIdx.x >= fill_threads: counting experts and aligning
+          // threadIdx.x < fill_threads: filling sorted_token_ids
+          constexpr int32_t fill_threads = 256;
+
+          dim3 blockDim(num_thread + fill_threads);
+          auto kernel =
+              vllm::moe::moe_lora_align_block_size_small_batch_expert_kernel<
+                  scalar_t, fill_threads>;
+          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
+              (void*)kernel, shared_mem));
+          // One thread block per LoRA slot.
+          kernel<<<max_loras, blockDim, shared_mem, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              token_lora_mapping.data_ptr<int32_t>(), block_size,
+              expert_map.data_ptr<int32_t>(), num_experts, max_loras,
+              topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks,
+              sorted_token_ids.data_ptr<int32_t>(),
+              expert_ids.data_ptr<int32_t>(), topk_num,
+              num_tokens_post_pad.data_ptr<int32_t>(),
+              adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>(),
+              token_mask.data_ptr<int32_t>(), has_expert_map);
+        } else {
+          int num_thread = 1024;
+          dim3 blockDim(num_thread);
+          size_t num_warps = CEILDIV(padded_num_experts, WARP_SIZE);
+
+          // One int32 of dynamic shared memory per padded expert.
+          size_t shared_mem_size = num_warps * WARP_SIZE * sizeof(int32_t);
+
+          // cumsum buffer: (num_experts + 1) zero-initialized int32 entries
+          // per LoRA slot, shared between the two kernels below
+          torch::Tensor cumsum =
+              torch::zeros({max_loras * (num_experts + 1)}, options_int);
+
+          auto align_kernel =
+              vllm::moe::moe_lora_align_block_size_kernel<scalar_t>;
+
+          // launch two threadblocks for each lora
+          // blockIdx.x % 2 == 0: counting experts and aligning
+          // blockIdx.x % 2 == 1: filling sorted_token_ids
+          align_kernel<<<max_loras * 2, blockDim, shared_mem_size, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              token_lora_mapping.data_ptr<int32_t>(), block_size,
+              expert_map.data_ptr<int32_t>(), num_experts, max_loras,
+              topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks,
+              sorted_token_ids.data_ptr<int32_t>(),
+              expert_ids.data_ptr<int32_t>(), topk_num,
+              num_tokens_post_pad.data_ptr<int32_t>(),
+              adapter_enabled.data_ptr<int32_t>(), cumsum.data_ptr<int32_t>(),
+              WARP_SIZE, padded_num_experts, lora_ids.data_ptr<int32_t>(),
+              token_mask.data_ptr<int32_t>(), has_expert_map);
+
+          // Second pass: enough 256-thread blocks to cover every topk entry,
+          // capped at 65535 blocks along grid y; grid x indexes the LoRA slot.
+          const int block_threads = std::min(256, (int)num_thread);
+          const int num_blocks =
+              (topk_ids.numel() + block_threads - 1) / block_threads;
+
+          const int max_blocks = 65535;
+          const int actual_blocks = std::min(num_blocks, max_blocks);
+
+          dim3 gridDims(max_loras, actual_blocks);
+          auto sort_kernel =
+              vllm::moe::lora_count_and_sort_expert_tokens_kernel<scalar_t>;
+
+          sort_kernel<<<gridDims, block_threads, 0, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              sorted_token_ids.data_ptr<int32_t>(), cumsum.data_ptr<int32_t>(),
+              expert_map.data_ptr<int32_t>(), topk_ids.numel(), num_experts,
+              max_num_tokens_padded, topk_num, token_mask.data_ptr<int32_t>(),
+              lora_ids.data_ptr<int32_t>(), has_expert_map);
+        }
+      });
+}
--- a/csrc/moe/moe_lora_align_sum_kernels.cu
+++ b/csrc/moe/moe_lora_align_sum_kernels.cu
@ -1,174 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#include <ATen/ATen.h>
-#include <ATen/cuda/Atomic.cuh>
-
-#include "../cuda_compat.h"
-#include "../dispatch_utils.h"
-#include "core/math.hpp"
-
-namespace {
-
-__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
-                                         int32_t col) {
-  return row * total_col + col;
-}
-
-}  // namespace
-
-// TODO: Refactor common parts with moe_align_sum_kernels
-template <typename scalar_t, typename token_cnts_t>
-__global__ void moe_lora_align_sum_kernel(
-    scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
-    int64_t block_size, int num_experts, int max_loras, size_t numel,
-    int max_num_tokens_padded, int max_num_m_blocks,
-    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
-    int topk_num, int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
-    int32_t* lora_ids) {
-  const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
-  const size_t start_idx = threadIdx.x * tokens_per_thread;
-
-  int lora_idx = blockIdx.x;
-  int lora_id = lora_ids[lora_idx];
-  if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
-    return;
-  }
-  extern __shared__ int32_t shared_mem[];
-  int32_t* cumsum = shared_mem;
-  token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);
-
-  // Initialize sorted_token_ids with numel
-  for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
-    sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel;
-  }
-
-  // Initialize expert_ids with -1
-  for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) {
-    expert_ids[lora_id * max_num_m_blocks + it] = -1;
-  }
-
-  // Initialize total_tokens_post_pad with 0
-  if (threadIdx.x == 0) {
-    total_tokens_post_pad[lora_id] = 0;
-  }
-
-  for (int i = 0; i < num_experts; ++i) {
-    tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
-  }
-
-  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
-    int mask = token_lora_mapping[i / topk_num] == lora_id;
-    int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]);
-    tokens_cnts[idx] += mask;
-  }
-
-  __syncthreads();
-
-  // For each expert we accumulate the token counts from the different threads.
-  if (threadIdx.x < num_experts) {
-    tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
-    for (int i = 1; i <= blockDim.x; ++i) {
-      tokens_cnts[index(num_experts, i, threadIdx.x)] +=
-          tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
-    }
-  }
-
-  __syncthreads();
-
-  // We accumulate the token counts of all experts in thread 0.
-  if (threadIdx.x == 0) {
-    cumsum[0] = 0;
-    for (int i = 1; i <= num_experts; ++i) {
-      cumsum[i] = cumsum[i - 1] +
-                  div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
-                           block_size) *
-                      block_size;
-    }
-    total_tokens_post_pad[lora_id] = static_cast<int32_t>(cumsum[num_experts]);
-  }
-
-  __syncthreads();
-
-  /**
-   * For each expert, each thread processes the tokens of the corresponding
-   * blocks and stores the corresponding expert_id for each block.
-   */
-  if (threadIdx.x < num_experts) {
-    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
-         i += block_size) {
-      expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] =
-          threadIdx.x;
-    }
-  }
-
-  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
-    int32_t expert_id = topk_ids[i];
-    /** The cumsum[expert_id] stores the starting index of the tokens that the
-     * expert with expert_id needs to process, and
-     * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
-     * processed by the expert with expert_id within the current thread's token
-     * shard.
-     */
-    int32_t rank_post_pad =
-        tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
-        cumsum[expert_id];
-
-    int mask = (int)token_lora_mapping[i / topk_num] == lora_id;
-    atomicAdd(
-        &sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)],
-        (i - numel) * mask);
-    tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask;
-  }
-}
-
-void moe_lora_align_block_size(
-    torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
-    int64_t num_experts, int64_t block_size, int64_t max_loras,
-    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
-    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
-    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
-    torch::Tensor lora_ids) {
-  const int topk_num = topk_ids.size(1);
-
-  TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
-
-  int device_max_shared_mem;
-  auto dev = topk_ids.get_device();
-  cudaDeviceGetAttribute(&device_max_shared_mem,
-                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  const int32_t num_thread = max((int32_t)num_experts, 128);  // WARP_SIZE,
-  TORCH_CHECK(num_thread <= 1024,
-              "num_thread must be less than 1024, "
-              "and fallback is not implemented yet.");
-  const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) +
-                             (num_experts + 1) * sizeof(int32_t);
-
-  if (shared_mem > device_max_shared_mem) {
-    TORCH_CHECK(false,
-                "Shared memory usage exceeds device limit, and global memory "
-                "fallback is not implemented yet.");
-  }
-
-  VLLM_DISPATCH_INTEGRAL_TYPES(
-      topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
-        dim3 blockDim(num_thread);
-        auto kernel = moe_lora_align_sum_kernel<scalar_t, int32_t>;
-        AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
-            (void*)kernel, shared_mem));
-        kernel<<<max_loras, blockDim, shared_mem, stream>>>(
-            topk_ids.data_ptr<scalar_t>(),
-            token_lora_mapping.data_ptr<int32_t>(), block_size, num_experts,
-            max_loras, topk_ids.numel(), max_num_tokens_padded,
-            max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
-            expert_ids.data_ptr<int32_t>(), topk_num,
-            num_tokens_post_pad.data_ptr<int32_t>(),
-            adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>());
-      });
-}
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@ -27,7 +27,7 @@ void moe_lora_align_block_size(
    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
-    torch::Tensor lora_ids);
+    torch::Tensor lora_ids, std::optional<torch::Tensor> maybe_expert_map);
 #ifndef USE_ROCM
 torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             torch::Tensor b_qweight, torch::Tensor b_scales,
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@ -47,7 +47,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "                     Tensor !experts_ids,"
      "                     Tensor !num_tokens_post_pad,"
      "                     Tensor !adapter_enabled,"
-      "                     Tensor !lora_ids) -> () ");
+      "                     Tensor !lora_ids,"
+      "                     Tensor? maybe_expert_map) -> () ");
  m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);

 #ifndef USE_ROCM
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -262,7 +262,8 @@ void get_cutlass_moe_mm_data(
 void get_cutlass_moe_mm_problem_sizes(
    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets);
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
+    std::optional<bool> force_swap_ab = std::nullopt);

 void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
                                  torch::Tensor& problem_sizes1,
--- a/csrc/quantization/cutlass_w4a8/get_group_starts.cuh
+++ b/csrc/quantization/cutlass_w4a8/get_group_starts.cuh
@ -0,0 +1,104 @@
+// see csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
+#pragma once
+
+#include <cuda.h>
+#include <torch/all.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include "core/scalar_type.hpp"
+#include "cutlass/bfloat16.h"
+#include "cutlass/float8.h"
+
+// Per-expert pointer setup for the w4a8 grouped GEMM.
+// ElementB is int32 (packed int4).
+// ElementGroupScale is cutlass::Array<cutlass::float_e4m3_t, 8> (packed fp8).
+// Thread t handles expert t: it reads entry t of expert_offsets and stores
+// that expert's base pointers into slot t of every output pointer table.
+template <typename ElementA, typename ElementB, typename ElementC,
+          typename ElementAccumulator, typename ElementGroupScale>
+__global__ void get_group_gemm_starts(
+    int64_t* expert_offsets, ElementA** a_offsets, ElementB** b_offsets,
+    ElementC** out_offsets, ElementAccumulator** a_scales_offsets,
+    ElementAccumulator** b_scales_offsets,
+    ElementGroupScale** b_group_scales_offsets, ElementA* a_base_as_int,
+    ElementB* b_base_as_int, ElementC* out_base_as_int,
+    ElementAccumulator* a_scales_base_as_int,
+    ElementAccumulator* b_scales_base_as_int,
+    ElementGroupScale* b_group_scales_base_as_int, int64_t n, int64_t k,
+    int64_t scale_k) {
+  // 8 int4 values are packed into every int32 element of B
+  constexpr int int4_per_word = 8;
+
+  const int eid = threadIdx.x;
+  const int64_t base = expert_offsets[eid];
+
+  // pointers computed the same way as in the w8a8 variant
+  a_offsets[eid] = a_base_as_int + base * k;
+  out_offsets[eid] = out_base_as_int + base * n;
+  a_scales_offsets[eid] = a_scales_base_as_int + base;
+  b_scales_offsets[eid] = b_scales_base_as_int + (n * eid);
+
+  // pointers specific to w4a8: packed weights and their group scales
+  b_offsets[eid] = b_base_as_int + (eid * k * n / int4_per_word);
+  b_group_scales_offsets[eid] =
+      b_group_scales_base_as_int + (eid * scale_k * n);
+}
+
+// Expands to one `else if` arm of a dtype dispatch chain: when out_tensors has
+// dtype TENSOR_C_TYPE, it launches get_group_gemm_starts with C_TYPE as the
+// output element type, using a single block of num_experts threads on
+// `stream`.
+// The expansion captures names from the enclosing scope (out_tensors,
+// num_experts, stream, expert_offsets, the *_ptrs tensors, a_tensors,
+// b_tensors, a_scales, b_scales, b_group_scales, n, k, scale_k) and, because
+// it begins with `else`, must directly follow an `if` block.
+#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE)                  \
+  else if (out_tensors.dtype() == TENSOR_C_TYPE) {                       \
+    get_group_gemm_starts<cutlass::float_e4m3_t, int32_t, C_TYPE, float, \
+                          cutlass::Array<cutlass::float_e4m3_t, 8>>      \
+        <<<1, num_experts, 0, stream>>>(                                 \
+            static_cast<int64_t*>(expert_offsets.data_ptr()),            \
+            static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()),     \
+            static_cast<int32_t**>(b_ptrs.data_ptr()),                   \
+            static_cast<C_TYPE**>(out_ptrs.data_ptr()),                  \
+            static_cast<float**>(a_scales_ptrs.data_ptr()),              \
+            static_cast<float**>(b_scales_ptrs.data_ptr()),              \
+            static_cast<cutlass::Array<cutlass::float_e4m3_t, 8>**>(     \
+                b_group_scales_ptrs.data_ptr()),                         \
+            static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()),   \
+            static_cast<int32_t*>(b_tensors.data_ptr()),                 \
+            static_cast<C_TYPE*>(out_tensors.data_ptr()),                \
+            static_cast<float*>(a_scales.data_ptr()),                    \
+            static_cast<float*>(b_scales.data_ptr()),                    \
+            static_cast<cutlass::Array<cutlass::float_e4m3_t, 8>*>(      \
+                b_group_scales.data_ptr()),                              \
+            n, k, scale_k);                                              \
+  }
+
+namespace {
+
+// Validates the operand dtypes and launches get_group_gemm_starts to fill the
+// per-expert pointer tables (a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs,
+// b_scales_ptrs, b_group_scales_ptrs) consumed by the grouped GEMM.
+//
+// expert_offsets must be int64 with one entry per expert; its length is used
+// as the thread count of the single launch block. n is out_tensors.size(1),
+// k is a_tensors.size(1), and scale_k is ceil(k / b_group_size).
+//
+// Raises (TORCH_CHECK) on a dtype mismatch, a non-positive b_group_size, or
+// more experts than fit in one thread block.
+void run_get_group_gemm_starts(
+    torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
+    torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
+    torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
+    torch::Tensor& b_group_scales_ptrs, torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors, torch::Tensor& out_tensors,
+    torch::Tensor const& a_scales, torch::Tensor const& b_scales,
+    torch::Tensor const& b_group_scales, const int64_t b_group_size) {
+  TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b_tensors.dtype() == torch::kInt32);  // int4 8x packed into int32
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_group_scales.dtype() ==
+              torch::kFloat8_e4m3fn);  // the underlying torch type is e4m3
+  TORCH_CHECK(out_tensors.dtype() ==
+              torch::kBFloat16);  // only support bf16 for now
+  // expect int64_t to avoid overflow during offset calculations
+  TORCH_CHECK(expert_offsets.dtype() == torch::kInt64);
+  // ceil_div below divides by b_group_size; reject zero/negative values here
+  // instead of hitting an integer division by zero on the host.
+  TORCH_CHECK(b_group_size > 0, "b_group_size must be positive, got ",
+              b_group_size);
+
+  int num_experts = static_cast<int>(expert_offsets.size(0));
+  // The kernel runs as one block with one thread per expert and the launch
+  // status is not inspected afterwards, so an oversized block would fail
+  // silently and leave the pointer tables uninitialized.
+  TORCH_CHECK(num_experts <= 1024,
+              "num_experts must not exceed 1024 (one thread per expert in a "
+              "single block), got ",
+              num_experts);
+  // logical k, n
+  int64_t n = out_tensors.size(1);
+  int64_t k = a_tensors.size(1);
+  int64_t scale_k = cutlass::ceil_div(k, b_group_size);
+
+  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
+
+  // Dispatch on the output dtype. With the bf16-only check above, only the
+  // BFloat16 arm can currently be taken.
+  if (false) {
+  }
+  __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t)
+  __CALL_GET_STARTS_KERNEL(torch::kFloat16, half)
+  else {
+    TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)");
+  }
+}
+
+}  // namespace
--- a/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
@ -0,0 +1,483 @@
+#include <vector>
+#include <tuple>
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/group_array_problem_shape.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/mixed_dtype_utils.hpp"
+
+// vllm includes
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+#include "cutlass_extensions/torch_utils.hpp"
+#include "cutlass_extensions/common.hpp"
+
+#include "core/registration.h"
+#include "get_group_starts.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+#include "w4a8_utils.cuh"
+
+namespace vllm::cutlass_w4a8_moe {
+
+using namespace cute;
+
+// -------------------------------------------------------------------------------------
+// Static configuration shared across all instantiations
+// -------------------------------------------------------------------------------------
+using ProblemShape =
+    cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;  // <M,N,K> per
+                                                             // group
+using MmaType = cutlass::float_e4m3_t;
+using QuantType = cutlass::int4b_t;
+
+constexpr int TileShapeK = 128 * 8 / sizeof_bits<MmaType>::value;
+static int constexpr PackFactor = 8;  // 8 int4 packed into int32
+
+// A matrix configuration
+using ElementA = MmaType;
+using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
+constexpr int AlignmentA =
+    128 /
+    cutlass::sizeof_bits<ElementA>::value;  // Alignment of A matrix in units of
+                                            // elements (up to 16 bytes)
+
+// B matrix configuration
+using ElementB = QuantType;  // Element type for B matrix operand
+using LayoutB =
+    cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
+constexpr int AlignmentB =
+    128 / cutlass::sizeof_bits<
+              ElementB>::value;  // Memory access granularity/alignment of B
+                                 // matrix in units of elements (up to 16 bytes)
+
+// This example manually swaps and transposes, so keep transpose of input
+// layouts
+using LayoutA_Transpose =
+    typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+using LayoutB_Transpose =
+    typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+
+// Need to pass a pointer type to make the 3rd dimension of Stride be _0
+using StrideA =
+    cute::remove_pointer_t<cutlass::detail::TagToStrideA_t<LayoutA*>>;
+using StrideB =
+    cute::remove_pointer_t<cutlass::detail::TagToStrideB_t<LayoutB*>>;
+
+// Define the CuTe layout for reordered quantized tensor B
+// LayoutAtomQuant places values that will be read by the same thread in
+// contiguous locations in global memory. It specifies the reordering within a
+// single warp's fragment
+using LayoutAtomQuant =
+    decltype(cutlass::compute_memory_reordering_atom<MmaType>());
+using LayoutB_Reordered = decltype(cute::tile_to_shape(
+    LayoutAtomQuant{}, Layout<Shape<int, int, Int<1>>, StrideB>{}));
+
+using ElementScale = cutlass::float_e4m3_t;
+using LayoutScale = cutlass::layout::RowMajor;
+
+// C/D matrix configuration
+using ElementC =
+    cutlass::bfloat16_t;  // Element type for C and D matrix operands
+using LayoutC =
+    cutlass::layout::RowMajor;  // Layout type for C and D matrix operands
+constexpr int AlignmentC =
+    128 / cutlass::sizeof_bits<
+              ElementC>::value;  // Memory access granularity/alignment of C
+                                 // matrix in units of elements (up to 16 bytes)
+
+// D matrix configuration
+using ElementD = ElementC;
+using LayoutD = LayoutC;
+constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+
+// Core kernel configurations
+using ElementAccumulator = float;     // Element type for internal accumulation
+using ArchTag = cutlass::arch::Sm90;  // Tag indicating the minimum SM that
+                                      // supports the intended feature
+using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
+using StageCountType =
+    cutlass::gemm::collective::StageCountAuto;  // Stage count maximized based
+                                                // on the tile size
+
+// per-channel and per-token scales for epilogue
+using ElementSChannel = float;
+
+// One concrete W4A8 grouped-GEMM configuration.
+//
+// Template parameters:
+//   TileShape_MN      tile shape without the K extent; TileShapeK is appended
+//                     to form the full TileShape.
+//   ClusterShape_MNK  cluster shape, used as-is.
+//   KernelSchedule    schedule passed to the mainloop CollectiveBuilder.
+//   EpilogueSchedule  schedule passed to the epilogue CollectiveBuilder.
+//
+// The static grouped_mm() member builds the per-expert pointer tables,
+// assembles the CUTLASS arguments and runs the GEMM on the current stream of
+// the device that holds a_tensors.
+template <class TileShape_MN, class ClusterShape_MNK, class KernelSchedule,
+          class EpilogueSchedule>
+struct W4A8GroupedGemmKernel {
+  using TileShape =
+      decltype(cute::append(TileShape_MN{}, cute::Int<TileShapeK>{}));
+  using ClusterShape = ClusterShape_MNK;
+
+  // per-channel, per-token scales epilogue
+  using ChTokScalesEpilogue =
+      typename vllm::c3x::ScaledEpilogueArray<ElementAccumulator, ElementD,
+                                              TileShape>;
+  using EVTCompute = typename ChTokScalesEpilogue::EVTCompute;
+  // C and D use the transposed layouts here (the operands are swapped, see
+  // the mainloop below).
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, TileShape, ClusterShape,
+          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
+          ElementSChannel, ElementC,
+          typename cutlass::layout::LayoutTranspose<LayoutC>::type*, AlignmentC,
+          ElementD, typename cutlass::layout::LayoutTranspose<LayoutD>::type*,
+          AlignmentD, EpilogueSchedule, EVTCompute>::CollectiveOp;
+
+  // =========================================================== MIXED INPUT
+  // WITH SCALES
+  // ===========================================================================
+  // The Scale information must get paired with the operand that will be scaled.
+  // In this example, B is scaled so we make a tuple of B's information and the
+  // scale information.
+  // The (B, scale) tuple is passed in the first operand slot and A (with its
+  // transposed layout) in the second; the stage count is chosen automatically
+  // after carving out the epilogue's shared storage.
+  using CollectiveMainloopShuffled =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag, OperatorClass,
+          cute::tuple<ElementB, cutlass::Array<ElementScale, 8>>,
+          LayoutB_Reordered*, AlignmentB, ElementA, LayoutA_Transpose*,
+          AlignmentA, ElementAccumulator, TileShape, ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          KernelSchedule>::CollectiveOp;
+
+  using GemmKernelShuffled = cutlass::gemm::kernel::GemmUniversal<
+      ProblemShape, CollectiveMainloopShuffled, CollectiveEpilogue>;
+
+  using GemmShuffled =
+      cutlass::gemm::device::GemmUniversalAdapter<GemmKernelShuffled>;
+
+  using StrideC = typename GemmKernelShuffled::InternalStrideC;
+  using StrideD = typename GemmKernelShuffled::InternalStrideD;
+
+  using StrideC_ref = cutlass::detail::TagToStrideC_t<LayoutC>;
+  using StrideD_ref = cutlass::detail::TagToStrideC_t<LayoutD>;
+  using StrideS = typename CollectiveMainloopShuffled::StrideScale;
+  using StrideS_ref = cutlass::detail::TagToStrideB_t<LayoutScale>;
+
+  // static asserts for passing in strides/layouts
+  // pack to 2x int64
+  static_assert(sizeof(StrideS) == 2 * sizeof(int64_t));
+  // pack to 3xint32,
+  static_assert(sizeof(LayoutB_Reordered) % sizeof(int32_t) == 0,
+                "LayoutB_Reordered size must be divisible by 4 bytes");
+
+  // Runs the grouped GEMM for this configuration.
+  //
+  // out_tensors receives the result. a_tensors / b_tensors are the
+  // activations and packed weights, a_scales / b_scales the per-token and
+  // per-channel scales, and b_group_scales with b_group_size the group-wise
+  // weight scales. expert_offsets has one entry per expert.
+  // problem_sizes_torch, a_strides, b_strides, c_strides and
+  // group_scale_strides are tensors whose raw storage is reinterpreted below
+  // as CUTLASS problem shapes, strides and layouts; their contents are not
+  // validated here.
+  static void grouped_mm(
+      torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
+      const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
+      const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
+      const int64_t b_group_size, const torch::Tensor& expert_offsets,
+      const torch::Tensor& problem_sizes_torch, const torch::Tensor& a_strides,
+      const torch::Tensor& b_strides, const torch::Tensor& c_strides,
+      const torch::Tensor& group_scale_strides) {
+    auto device = a_tensors.device();
+    auto device_id = device.index();
+    const at::cuda::OptionalCUDAGuard device_guard(device);
+    auto stream = at::cuda::getCurrentCUDAStream(device_id);
+
+    int num_experts = static_cast<int>(expert_offsets.size(0));
+    // NOTE(review): n and k are not referenced again in this function; the
+    // size() calls still require b_tensors to have at least three dimensions.
+    int n = static_cast<int>(b_tensors.size(1));
+    int k = static_cast<int>(b_tensors.size(2)) * PackFactor;
+
+    // int64 tensors with one slot per expert; each slot is filled with a
+    // device pointer by run_get_group_gemm_starts and read back through the
+    // pointer-to-pointer casts below.
+    auto options_int =
+        torch::TensorOptions().dtype(torch::kInt64).device(device);
+    torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
+    torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
+    torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
+    torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
+    torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
+    torch::Tensor b_group_scales_ptrs = torch::empty(num_experts, options_int);
+
+    // get the correct offsets to pass to gemm
+    run_get_group_gemm_starts(expert_offsets, a_ptrs, b_ptrs, out_ptrs,
+                              a_scales_ptrs, b_scales_ptrs, b_group_scales_ptrs,
+                              a_tensors, b_tensors, out_tensors, a_scales,
+                              b_scales, b_group_scales, b_group_size);
+
+    // construct args
+    using Args = typename GemmShuffled::Arguments;
+    using MainloopArguments = typename GemmKernelShuffled::MainloopArguments;
+    using EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments;
+    Args arguments;
+
+    // Reinterpret the problem-size tensor's storage as an array of per-group
+    // <M,N,K> int triples; no host-side copy of the shapes is supplied
+    // (nullptr).
+    ProblemShape::UnderlyingProblemShape* problem_sizes_as_shapes =
+        static_cast<ProblemShape::UnderlyingProblemShape*>(
+            problem_sizes_torch.data_ptr());
+    ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr};
+
+    // SwapAB so B operands come first
+    MainloopArguments mainloop_arguments{
+        static_cast<const QuantType**>(b_ptrs.data_ptr()),
+        static_cast<LayoutB_Reordered*>(b_strides.data_ptr()),
+        static_cast<const MmaType**>(a_ptrs.data_ptr()),
+        static_cast<StrideA*>(a_strides.data_ptr()),
+        static_cast<const cutlass::Array<ElementScale, 8>**>(
+            b_group_scales_ptrs.data_ptr()),
+        static_cast<StrideS*>(group_scale_strides.data_ptr()),
+        static_cast<int>(b_group_size)};
+
+    // No C operand is supplied (nullptr); D is written through out_ptrs and
+    // reuses c_strides for its stride.
+    EpilogueArguments epilogue_arguments{
+        // since we are doing SwapAB the channel scales comes first, then token
+        // scales
+        ChTokScalesEpilogue::prepare_args(  // see ScaledEpilogueArray
+            static_cast<const ElementAccumulator**>(
+                b_scales_ptrs.data_ptr()),  // per-channel
+            static_cast<const ElementAccumulator**>(
+                a_scales_ptrs.data_ptr()),  // per-token
+            true, true),
+        nullptr,                                       // C
+        static_cast<StrideC*>(c_strides.data_ptr()),   // C
+        static_cast<ElementD**>(out_ptrs.data_ptr()),  // D
+        static_cast<StrideC*>(c_strides.data_ptr())    // D
+    };
+
+    // NOTE(review): function-local static - initialized once per template
+    // instantiation from the device_id of the first call and reused on every
+    // later call, including calls whose tensors live on a different device.
+    // Confirm this is acceptable for multi-GPU processes.
+    static const cutlass::KernelHardwareInfo hw_info{
+        device_id,
+        cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+            device_id)};
+
+    arguments = Args{cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape,
+                     mainloop_arguments, epilogue_arguments, hw_info};
+
+    // Allocate workspace
+    size_t workspace_size = GemmShuffled::get_workspace_size(arguments);
+    torch::Tensor workspace =
+        torch::empty(workspace_size,
+                     torch::TensorOptions().dtype(torch::kU8).device(device));
+
+    // Run GEMM
+    GemmShuffled gemm;
+    CUTLASS_CHECK(gemm.can_implement(arguments));
+    CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
+    CUTLASS_CHECK(gemm.run(stream));
+  }
+};
+
+// ----------------------------------------------------------------------------
+// Kernel instantiations and dispatch logic
+// ----------------------------------------------------------------------------
+// Mainloop and epilogue schedules shared by every instantiation below.
+using Coop = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative;
+using CoopEpi = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative;
+
+// Kernel_TileShape_ClusterShape_Schedule
+// The first Shape<> is TileShape_MN (TileShapeK is appended inside
+// W4A8GroupedGemmKernel) and the second is ClusterShape_MNK. mm_dispatch
+// selects an instantiation by comparing its `schedule` string against these
+// alias names, so a name and its template arguments must stay in sync.
+using Kernel_128x16_1x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_128, _16>, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_128x16_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_128, _16>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_256x16_1x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _16>, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_256x16_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _16>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_256x32_1x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _32>, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_256x32_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _32>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_256x64_1x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _64>, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_256x64_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _64>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_256x128_1x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _128>, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_256x128_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_256, _128>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+// Only the 2x1x1 cluster variant is defined for the 128x256 tile.
+using Kernel_128x256_2x1x1_Coop =
+    W4A8GroupedGemmKernel<Shape<_128, _256>, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+void mm_dispatch(
+    torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
+    const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
+    const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
+    const int64_t b_group_size, const torch::Tensor& expert_offsets,
+    const torch::Tensor& problem_sizes, const torch::Tensor& a_strides,
+    const torch::Tensor& b_strides, const torch::Tensor& c_strides,
+    const torch::Tensor& group_scale_strides, const std::string& schedule) {
+  if (schedule == "Kernel_128x16_1x1x1_Coop") {
+    Kernel_128x16_1x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_128x16_2x1x1_Coop") {
+    Kernel_128x16_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x16_1x1x1_Coop") {
+    Kernel_256x16_1x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x16_2x1x1_Coop") {
+    Kernel_256x16_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x32_1x1x1_Coop") {
+    Kernel_256x32_1x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x32_2x1x1_Coop") {
+    Kernel_256x32_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x64_1x1x1_Coop") {
+    Kernel_256x64_1x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x64_2x1x1_Coop") {
+    Kernel_256x64_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x128_1x1x1_Coop") {
+    Kernel_256x128_1x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_256x128_2x1x1_Coop") {
+    Kernel_256x128_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else if (schedule == "Kernel_128x256_2x1x1_Coop") {
+    Kernel_128x256_2x1x1_Coop::grouped_mm(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+        c_strides, group_scale_strides);
+  } else {
+    TORCH_CHECK(false,
+                "cutlass_w4a8_moe_mm: unknown schedule string: ", schedule);
+  }
+}
+
+// Entry point for the W4A8 grouped (MoE) GEMM op.
+//
+// If `maybe_schedule` holds a value it is passed to mm_dispatch as-is.
+// Otherwise a schedule is picked from the average number of rows per expert
+// (a_tensors.size(0) / b_tensors.size(0)), i.e. assuming tokens are spread
+// uniformly across experts. All tensor arguments are forwarded unchanged.
+//
+// Raises (TORCH_CHECK) if the heuristic path is taken with zero experts, or
+// if mm_dispatch does not recognise the schedule string.
+void mm(torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
+        const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
+        const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
+        const int64_t b_group_size, const torch::Tensor& expert_offsets,
+        const torch::Tensor& problem_sizes, const torch::Tensor& a_strides,
+        const torch::Tensor& b_strides, const torch::Tensor& c_strides,
+        const torch::Tensor& group_scale_strides,
+        std::optional<std::string> maybe_schedule) {
+  // user has specified a schedule
+  if (maybe_schedule) {
+    mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
+                b_group_scales, b_group_size, expert_offsets, problem_sizes,
+                a_strides, b_strides, c_strides, group_scale_strides,
+                *maybe_schedule);
+    return;
+  }
+
+  // use heuristic
+  // Keep sizes in int64_t (what Tensor::size returns) to avoid narrowing.
+  const int64_t m_full = a_tensors.size(0);
+  const int64_t num_experts = b_tensors.size(0);
+  // Guard the division below; an empty expert dimension would otherwise be an
+  // integer division by zero.
+  TORCH_CHECK(num_experts > 0,
+              "cutlass_w4a8_moe_mm: b_tensors must contain at least one "
+              "expert");
+  // per-expert batch size assuming uniform distribution
+  const int64_t m_expert = m_full / num_experts;
+
+  std::string schedule;
+  if (m_expert <= 16) {
+    schedule = "Kernel_128x16_2x1x1_Coop";
+  } else if (m_expert <= 32) {
+    schedule = "Kernel_256x32_1x1x1_Coop";
+  } else if (m_expert <= 64) {
+    schedule = "Kernel_256x64_1x1x1_Coop";
+  } else if (m_expert <= 128) {
+    schedule = "Kernel_256x128_2x1x1_Coop";
+  } else {  // m_expert > 128
+    schedule = "Kernel_128x256_2x1x1_Coop";
+  }
+
+  mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
+              b_group_scales, b_group_size, expert_offsets, problem_sizes,
+              a_strides, b_strides, c_strides, group_scale_strides, schedule);
+}
+
+// Encodes and reorders the packed int4 weights of every expert.
+//
+// b_tensors: contiguous CUDA int32 tensor of rank 3, (experts, n, k_packed),
+//            where the logical k is k_packed * PackFactor.
+//
+// Returns a tuple of
+//   - a tensor shaped like b_tensors holding the encoded, reordered weights;
+//   - an int32 tensor of shape (num_experts, layout_width) on b_tensors'
+//     device, where each row is a raw byte copy of the same
+//     LayoutB_Reordered object.
+//
+// Raises (TORCH_CHECK) on wrong dtype/rank/contiguity/device, when the
+// logical k is not a multiple of 256 or n is not a multiple of 16, or when
+// the encode step reports failure.
+std::tuple<torch::Tensor, torch::Tensor> encode_and_reorder_int4b(
+    torch::Tensor const& b_tensors) {
+  TORCH_CHECK(b_tensors.dtype() == torch::kInt32);
+  TORCH_CHECK(b_tensors.dim() == 3);  // (experts, n, k)
+  TORCH_CHECK(b_tensors.is_contiguous());
+  TORCH_CHECK(b_tensors.is_cuda());
+
+  int n = static_cast<int>(b_tensors.size(1));
+  int k = static_cast<int>(b_tensors.size(2)) * PackFactor;  // logical k
+
+  // CUTLASS reorder_tensor requires k % 256 == 0 and n % 16 == 0.
+  // These misalignments cause silent OOB unless run under Compute Sanitizer.
+  TORCH_CHECK(k % 256 == 0, "logical k must be divisible by 256");
+  TORCH_CHECK(n % 16 == 0, "n must be divisible by 16");
+
+  // we will store the layout to an int32 tensor;
+  // this is the number of elements we need per layout
+  constexpr size_t layout_width = sizeof(LayoutB_Reordered) / sizeof(int32_t);
+
+  torch::Tensor b_tensors_packed = torch::empty_like(b_tensors);
+  int num_experts = static_cast<int>(b_tensors.size(0));
+
+  auto b_ptr = static_cast<QuantType const*>(b_tensors.const_data_ptr());
+  auto b_packed_ptr = static_cast<QuantType*>(b_tensors_packed.data_ptr());
+
+  // Encode all experts in one pass.
+  // multiply by ull so result does not overflow int32
+  size_t num_int4_elems = 1ull * num_experts * n * k;
+  bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(b_ptr, b_packed_ptr,
+                                                           num_int4_elems);
+  TORCH_CHECK(ok, "unified_encode_int4b failed");
+
+  // construct the layout once; assumes each expert has the same layout
+  using LayoutType = LayoutB_Reordered;
+  auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, Int<1>{}});
+  auto shape_B = cute::make_shape(n, k, Int<1>{});
+  auto layout_B = make_layout(shape_B, stride_B);
+  LayoutType layout_B_reordered = tile_to_shape(LayoutAtomQuant{}, shape_B);
+
+  // reorder weights for each expert
+  for (int i = 0; i < num_experts; i++) {
+    // since the storage type of int4b is 1 byte but one element is 4 bits
+    // we need to adjust the offset
+    int64_t offset =
+        1ull * i * n * k * cutlass::sizeof_bits<QuantType>::value / 8;
+    cutlass::reorder_tensor(b_packed_ptr + offset, layout_B,
+                            layout_B_reordered);
+  }
+
+  // save the packed layout to torch tensor so we can re-use it
+  auto cpu_opts =
+      torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU);
+  torch::Tensor layout_cpu =
+      torch::empty({num_experts, layout_width}, cpu_opts);
+
+  // Every row receives a byte copy of the single shared layout object.
+  int32_t* layout_data = layout_cpu.data_ptr<int32_t>();
+  for (int i = 0; i < num_experts; ++i) {
+    std::memcpy(layout_data + i * layout_width,  // dst (int32*)
+                &layout_B_reordered,             // src (LayoutType*)
+                sizeof(LayoutType));             // number of bytes
+  }
+
+  // Blocking copy, so the host staging tensor can be released on return.
+  torch::Tensor packed_layout =
+      layout_cpu.to(b_tensors.device(), /*non_blocking=*/false);
+
+  return {b_tensors_packed, packed_layout};
+}
+
+// Bind the CUDA implementations of the grouped W4A8 ops to their op names.
+// Only the implementations are registered here, not the op schemas.
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("cutlass_w4a8_moe_mm", &mm);
+  m.impl("cutlass_encode_and_reorder_int4b_grouped", &encode_and_reorder_int4b);
+}
+
+}  // namespace vllm::cutlass_w4a8_moe
+/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
@ -7,6 +7,7 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/all.h>
 #include "cutlass_extensions/torch_utils.hpp"
+#include "w4a8_utils.cuh"

 #include "core/registration.h"

@ -395,71 +396,6 @@ torch::Tensor pack_scale_fp8(torch::Tensor const& scales) {
  return packed_scales;
 }

-/*
-  GPU-accelerated implementation of cutlass::unified_encode_int4b.
-  Constructs a lookup table in constant memory to map 8 bits
-  (two 4-bit values) at a time. Assumes memory is contiguous
-  and pointers are 16-byte aligned.
-*/
-__constant__ uint8_t kNibbleLUT[256];
-
-__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
-                                            size_t nbytes) {
-  constexpr size_t V = sizeof(uint4);  // 16 bytes
-  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const size_t nthreads = size_t(gridDim.x) * blockDim.x;
-  const size_t nvec = nbytes / V;
-
-  // 1-D grid-stride loop over 16-byte chunks
-  for (size_t vec = tid; vec < nvec; vec += nthreads) {
-    uint4 v = reinterpret_cast<const uint4*>(in)[vec];
-    uint8_t* b = reinterpret_cast<uint8_t*>(&v);
-#pragma unroll
-    for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
-    reinterpret_cast<uint4*>(out)[vec] = v;
-  }
-}
-
-static bool upload_lut() {
-  std::array<uint8_t, 256> lut{};
-  auto map_nib = [](uint8_t v) -> uint8_t {
-    // 1..7 -> (8 - v); keep 0 and 8..15
-    return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v);
-  };
-  for (int b = 0; b < 256; ++b) {
-    uint8_t lo = b & 0xF;
-    uint8_t hi = (b >> 4) & 0xF;
-    lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo));
-  }
-  cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(),
-                                     /*offset=*/0, cudaMemcpyHostToDevice);
-
-  return (e == cudaSuccess);
-}
-
-static bool unified_encode_int4b(cutlass::int4b_t const* in,
-                                 cutlass::int4b_t* out, size_t num_int4_elems) {
-  // Build/upload LUT
-  if (!upload_lut()) return false;
-
-  static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1,
-                "int4 storage must be 1 byte");
-  const size_t nbytes = num_int4_elems >> 1;
-
-  auto* in_bytes = reinterpret_cast<uint8_t const*>(in);
-  auto* out_bytes = reinterpret_cast<uint8_t*>(out);
-
-  // kernel launch params
-  constexpr int block = 256;
-  const size_t nvec = nbytes / sizeof(uint4);  // # of 16B vectors
-  int grid = int((nvec + block - 1) / block);
-  if (grid == 0) grid = 1;  // ensure we still cover the tail in the kernel
-
-  unified_encode_int4b_device<<<grid, block>>>(in_bytes, out_bytes, nbytes);
-  cudaError_t err = cudaGetLastError();
-  return (err == cudaSuccess);
-}
-
 torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
  TORCH_CHECK(B.dtype() == torch::kInt32);
  TORCH_CHECK(B.dim() == 2);
@ -477,8 +413,8 @@ torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
  LayoutB_Reordered layout_B_reordered =
      cute::tile_to_shape(LayoutAtomQuant{}, shape_B);

-  bool ok =
-      vllm::cutlass_w4a8::unified_encode_int4b(B_ptr, B_packed_ptr, n * k);
+  bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(B_ptr, B_packed_ptr,
+                                                           n * k);
  TORCH_CHECK(ok, "unified_encode_int4b failed");
  cutlass::reorder_tensor(B_packed_ptr, layout_B, layout_B_reordered);

--- a/csrc/quantization/cutlass_w4a8/w4a8_utils.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_utils.cu
@ -0,0 +1,90 @@
+#include "w4a8_utils.cuh"
+
+#include <array>
+#include <cuda_runtime.h>
+#include <cstdio>
+
+namespace vllm::cutlass_w4a8_utils {
+
+/*
+  GPU-accelerated implementation of cutlass::unified_encode_int4b.
+  Constructs a lookup table in constant memory to map 8 bits
+  (two 4-bit values) at a time. Assumes memory is contiguous
+  and pointers are 16-byte aligned.
+*/
+__constant__ uint8_t kNibbleLUT[256];
+
+// Maps every byte of `in` through kNibbleLUT and stores the result in `out`.
+//
+// in, out: buffers of `nbytes` bytes; the vectorized path reads and writes
+//          them as uint4, so both must be 16-byte aligned.
+// nbytes:  number of bytes to translate. Whole 16-byte chunks go through the
+//          vectorized loop; any remaining (nbytes % 16) bytes are handled one
+//          byte at a time so no part of `out` is left unwritten.
+//
+// Both loops stride by the total thread count, so any 1-D launch
+// configuration with at least one thread covers the entire buffer.
+__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
+                                            size_t nbytes) {
+  constexpr size_t V = sizeof(uint4);  // 16 bytes
+  // Widen before multiplying so the global thread index cannot wrap in
+  // 32-bit arithmetic on very large grids.
+  const size_t tid = size_t(blockIdx.x) * blockDim.x + threadIdx.x;
+  const size_t nthreads = size_t(gridDim.x) * blockDim.x;
+  const size_t nvec = nbytes / V;
+
+  // 1-D grid-stride loop over 16-byte chunks
+  for (size_t vec = tid; vec < nvec; vec += nthreads) {
+    uint4 v = reinterpret_cast<const uint4*>(in)[vec];
+    uint8_t* b = reinterpret_cast<uint8_t*>(&v);
+#pragma unroll
+    for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
+    reinterpret_cast<uint4*>(out)[vec] = v;
+  }
+
+  // Tail: bytes past the last full 16-byte chunk (at most 15 of them).
+  for (size_t idx = nvec * V + tid; idx < nbytes; idx += nthreads) {
+    out[idx] = kNibbleLUT[in[idx]];
+  }
+}
+
+// Builds the 256-entry byte translation table on the host and copies it into
+// the kNibbleLUT constant-memory symbol. Each byte holds two nibbles, and each
+// nibble is remapped independently. Returns true iff the copy succeeded.
+static bool upload_lut() {
+  // Nibble rule: values 1..7 become (8 - v); 0 and 8..15 stay as they are.
+  const auto encode_nibble = [](uint8_t nib) -> uint8_t {
+    if (nib == 0 || (nib & 0x8)) {
+      return nib;
+    }
+    return uint8_t(8 - nib);
+  };
+
+  std::array<uint8_t, 256> table{};
+  for (int byte = 0; byte < 256; ++byte) {
+    const uint8_t high = (byte >> 4) & 0xF;
+    const uint8_t low = byte & 0xF;
+    table[byte] = uint8_t((encode_nibble(high) << 4) | encode_nibble(low));
+  }
+
+  const cudaError_t status =
+      cudaMemcpyToSymbol(kNibbleLUT, table.data(), table.size(),
+                         /*offset=*/0, cudaMemcpyHostToDevice);
+  return status == cudaSuccess;
+}
+
+// Host-side driver for the int4 encode: uploads the lookup table, launches
+// unified_encode_int4b_device over num_int4_elems / 2 bytes, and waits for the
+// kernel to finish. Returns false (after printing a diagnostic for launch and
+// runtime failures) if any CUDA step fails, true otherwise.
+bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out,
+                          size_t num_int4_elems) {
+  static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1,
+                "int4 storage must be 1 byte");
+
+  // The kernel reads its byte mapping from constant memory, so the table has
+  // to be uploaded before anything is launched.
+  if (!upload_lut()) {
+    return false;
+  }
+
+  // Two 4-bit elements share one byte.
+  const size_t nbytes = num_int4_elems / 2;
+  const uint8_t* src = reinterpret_cast<const uint8_t*>(in);
+  uint8_t* dst = reinterpret_cast<uint8_t*>(out);
+
+  // Enough 256-thread blocks for one thread per 16-byte vector; always launch
+  // at least one block, even when there is no full vector.
+  constexpr int block = 256;
+  const size_t nvec = nbytes / sizeof(uint4);
+  const int blocks_needed = int((nvec + block - 1) / block);
+  const int grid = (blocks_needed == 0) ? 1 : blocks_needed;
+
+  unified_encode_int4b_device<<<grid, block>>>(src, dst, nbytes);
+
+  // Launch failures are reported through cudaGetLastError().
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status != cudaSuccess) {
+    printf("unified_encode_int4b_device launch error: %s (%d)\n",
+           cudaGetErrorString(launch_status), launch_status);
+    return false;
+  }
+
+  // Execution failures only surface once the device has been synchronized.
+  const cudaError_t run_status = cudaDeviceSynchronize();
+  if (run_status != cudaSuccess) {
+    printf("unified_encode_int4b_device runtime error: %s (%d)\n",
+           cudaGetErrorString(run_status), run_status);
+    return false;
+  }
+
+  return true;
+}
+
+}  // namespace vllm::cutlass_w4a8_utils
--- a/csrc/quantization/cutlass_w4a8/w4a8_utils.cuh
+++ b/csrc/quantization/cutlass_w4a8/w4a8_utils.cuh
@ -0,0 +1,11 @@
+#pragma once
+
+#include <cstddef>
+#include "cutlass/numeric_types.h"
+
+namespace vllm::cutlass_w4a8_utils {
+
+// GPU-accelerated implementation of cutlass::unified_encode_int4b.
+//
+// in:  source buffer of packed 4-bit values (two per byte).
+// out: destination buffer of the same size.
+// num_int4_elems: number of 4-bit elements; num_int4_elems / 2 bytes are
+//                 processed.
+//
+// Both pointers are dereferenced by a CUDA kernel and accessed as 16-byte
+// vectors, so they must be device-accessible, contiguous, and 16-byte
+// aligned. The call waits for the kernel to finish before returning.
+//
+// Returns true on success, false if any CUDA call reports an error.
+bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out,
+                          size_t num_int4_elems);
+
+}  // namespace vllm::cutlass_w4a8_utils
--- a/csrc/quantization/machete/machete_mainloop.cuh
+++ b/csrc/quantization/machete/machete_mainloop.cuh
@ -617,7 +617,7 @@ struct MacheteCollectiveMma {

  // Same as upstream, should be kept the same when possible, not formatted for
  // easier comparison
-  //   with `SwapAB ? N : M -> M` since we dont support SwapAB
+  //   with `SwapAB ? N : M -> M` since we don't support SwapAB
  // clang-format off
  template<class ProblemShape>
  static bool
--- a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
+++ b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
@ -136,15 +136,17 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
 void get_cutlass_moe_mm_problem_sizes_caller(
    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
+    std::optional<bool> force_swap_ab = std::nullopt) {
  auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
  auto options_int32 =
      torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
  torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);

  // Swap-AB should be disabled for FP4 path
-  bool may_swap_ab = (!blockscale_offsets.has_value()) &&
-                     (topk_ids.numel() <= SWAP_AB_THRESHOLD);
+  bool may_swap_ab =
+      force_swap_ab.value_or((!blockscale_offsets.has_value()) &&
+                             (topk_ids.numel() <= SWAP_AB_THRESHOLD));

  launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
                               atomic_buffer, num_experts, n, k, stream,
--- a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
@ -80,7 +80,8 @@ void get_cutlass_moe_mm_data_caller(
 void get_cutlass_moe_mm_problem_sizes_caller(
    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets);
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
+    std::optional<bool> force_swap_ab = std::nullopt);

 void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
                                         torch::Tensor& problem_sizes1,
@ -303,14 +304,15 @@ void get_cutlass_moe_mm_data(
 void get_cutlass_moe_mm_problem_sizes(
    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
+    std::optional<bool> force_swap_ab = std::nullopt) {
  int32_t version_num = get_sm_version_num();
 #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) ||   \
    (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
    (defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
  get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
                                          problem_sizes2, num_experts, n, k,
-                                          blockscale_offsets);
+                                          blockscale_offsets, force_swap_ab);
  return;
 #endif
  TORCH_CHECK_NOT_IMPLEMENTED(
--- a/csrc/rocm/skinny_gemms.cu
+++ b/csrc/rocm/skinny_gemms.cu
@ -1241,33 +1241,16 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const int Bx,
 }
 #endif  // defined(__HIP__GFX9__) TODO: Add NAVI support

+// Find the min val of div2 that doesn't increase N/(div1*div2).
+// Considers div2, div2 - 1, ..., div2 - 12 (never going below 1) and returns
+// the smallest candidate d for which ceil(N / (div1 * d)) still equals
+// ceil(N / (div1 * div2)). Expects div1 > 0 and div2 > 0.
 int mindiv(int N, int div1, int div2) {
   int nPrRnd = div1 * div2;
-  int rnds0 = N / nPrRnd;
-  nPrRnd -= div1 * 3;
-  int rnds3 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds4 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds5 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds6 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds7 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds8 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds9 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rtn = div2;
-  if (rnds0 == rnds3) rtn = div2 - 3;
-  if (rnds0 == rnds4) rtn = div2 - 4;
-  if (rnds0 == rnds5) rtn = div2 - 5;
-  if (rnds0 == rnds6) rtn = div2 - 6;
-  if (rnds0 == rnds7) rtn = div2 - 7;
-  if (rnds0 == rnds8) rtn = div2 - 8;
-  if (rnds0 == rnds9) rtn = div2 - 9;
-  return rtn;
+  // Round count (ceiling division) with the full div2.
+  const int rnds0 = (N + nPrRnd - 1) / nPrRnd;
+  int best = div2;
+  // Stop before div2 - i reaches 0 so nPrRnd is never a zero divisor.
+  for (int i = 1; i < 13 && i < div2; i++) {
+    nPrRnd -= div1;
+    // The last match wins, i.e. the largest reduction that keeps rnds0.
+    if ((N + nPrRnd - 1) / nPrRnd == rnds0) best = div2 - i;
+  }
+  // Every path returns a value; div2 itself is the fallback.
+  return best;
 }

 torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
@ -1300,26 +1283,37 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  const int max_lds_len = get_lds_size() / 2;

-#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
-                 _N)                                                          \
-  {                                                                           \
-    dim3 block(64, _WvPrGrp);                                                 \
-    if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) {              \
-      int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp);              \
-      wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N>          \
-          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4,    \
-                                       biasf4, c, __wvPrGrp, CuCount);        \
-    } else if (K_in * N_in <= max_lds_len * 1.2) {                            \
-      int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp);              \
-      wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N>              \
-          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4,    \
-                                       biasf4, c, __wvPrGrp, CuCount);        \
-    } else {                                                                  \
-      int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp);              \
-      wvSplitK_hf_big_<fptype, 64, _YTILEb, _WvPrGrp, 8, _UNRLb, _N>          \
-          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4,    \
-                                       biasf4, c, __wvPrGrp, CuCount);        \
-    }                                                                         \
+#define WVSPLITK(_YTILE, _UNRL, _N)                                        \
+  {                                                                        \
+    dim3 block(64, 16);                                                    \
+    int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16);                    \
+    if ((K_in * N_in <= max_lds_len) && (M_in % _YTILE == 0))              \
+      wvSplitK_hf_sml_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>               \
+          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
+                                       biasf4, c, __wvPrGrp, CuCount);     \
+    else if (K_in * N_in <= max_lds_len * 1.2)                             \
+      wvSplitK_hf_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>                   \
+          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
+                                       biasf4, c, __wvPrGrp, CuCount);     \
+    else                                                                   \
+      wvSplitK_hf_big_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>               \
+          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
+                                       biasf4, c, __wvPrGrp, CuCount);     \
+  }
+
+#define WVSPLIT_TILE(_sYT, __N)                           \
+  {                                                       \
+    bool fit_lds = (K_in * N_in <= max_lds_len);          \
+    if (_sYT <= 1)                                        \
+      WVSPLITK(1, 4, __N)                                 \
+    else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \
+      WVSPLITK(2, 2, __N)                                 \
+    else if (_sYT <= 4 * 3)                               \
+      WVSPLITK(3, 2, __N)                                 \
+    else if (__N == 4)                                    \
+      WVSPLITK(4, 1, __N)                                 \
+    else                                                  \
+      WVSPLITK(4, 2, __N)                                 \
  }

  AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] {
@ -1331,18 +1325,23 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
            ? reinterpret_cast<const fptype*>(in_bias->data_ptr())
            : nullptr;
    fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
+
+    // first shoot for biggest tile-size that keeps all simd busy,
+    // then cut the active waves to balance their distribution...
+    int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4);
+
    switch (N_in) {
      case 1:
-        WVSPLITK(16, 2, 2, 2, 2, 2, 2, 1)
+        WVSPLIT_TILE(sYT, 1)
        break;
      case 2:
-        WVSPLITK(16, 2, 2, 2, 2, 2, 2, 2)
+        WVSPLIT_TILE(sYT, 2)
        break;
      case 3:
-        WVSPLITK(16, 4, 7, 7, 1, 1, 1, 3)
+        WVSPLIT_TILE(sYT, 3)
        break;
      case 4:
-        WVSPLITK(16, 4, 7, 7, 1, 1, 1, 4)
+        WVSPLIT_TILE(sYT, 4)
        break;
      default:
        throw std::runtime_error(
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -350,6 +350,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("cutlass_encode_and_reorder_int4b(Tensor B) -> Tensor");
  // conditionally compiled so impl registration is in source file

+  // CUTLASS w4a8 grouped GEMM
+  ops.def(
+      "cutlass_w4a8_moe_mm("
+      "   Tensor! out_tensors,"
+      "   Tensor a_tensors,"
+      "   Tensor b_tensors,"
+      "   Tensor a_scales,"
+      "   Tensor b_scales,"
+      "   Tensor b_group_scales,"
+      "   int b_group_size,"
+      "   Tensor expert_offsets,"
+      "   Tensor problem_sizes,"
+      "   Tensor a_strides,"
+      "   Tensor b_strides,"
+      "   Tensor c_strides,"
+      "   Tensor group_scale_strides,"
+      "   str? maybe_schedule"
+      ") -> ()");
+  ops.def(
+      "cutlass_encode_and_reorder_int4b_grouped(Tensor b_tensors) -> (Tensor, "
+      "Tensor)");
+  // conditionally compiled so impl registration is in source file
+
 #endif

  // Dequantization for GGML.
@ -466,7 +489,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "                                 Tensor! problem_sizes1, "
      "                                 Tensor! problem_sizes2, "
      "                                 int num_experts, int n, int k, "
-      "                                 Tensor? blockscale_offsets) -> ()");
+      "                                 Tensor? blockscale_offsets, "
+      "                                 bool? force_swap_ab) -> ()");
  ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA,
           &get_cutlass_moe_mm_problem_sizes);

--- a/docs/api/README.md
+++ b/docs/api/README.md
@ -15,6 +15,7 @@ API documentation for vLLM's configuration classes.
 - [vllm.config.MultiModalConfig][]
 - [vllm.config.PoolerConfig][]
 - [vllm.config.StructuredOutputsConfig][]
+- [vllm.config.ProfilerConfig][]
 - [vllm.config.ObservabilityConfig][]
 - [vllm.config.KVTransferConfig][]
 - [vllm.config.CompilationConfig][]
--- a/docs/benchmarking/cli.md
+++ b/docs/benchmarking/cli.md
@ -84,7 +84,7 @@ Total input tokens:                      1369
 Total generated tokens:                  2212
 Request throughput (req/s):              1.73
 Output token throughput (tok/s):         382.89
-Total Token throughput (tok/s):          619.85
+Total token throughput (tok/s):          619.85
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          71.54
 Median TTFT (ms):                        73.88
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@ -5,16 +5,15 @@

 ## Profile with PyTorch Profiler

-We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`. Additionally, you can control the profiling content by specifying the following environment variables:
+We support tracing vLLM workers using the `torch.profiler` module. You can enable the torch profiler by passing `--profiler-config`
+when launching the server, with the `profiler` entry set to `'torch'` and the `torch_profiler_dir` entry set to the directory where you want to save the traces. You can further control the profiling content by specifying the following additional entries in the config:

- `VLLM_TORCH_PROFILER_RECORD_SHAPES=1` to enable recording Tensor Shapes, off by default
- `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default
- `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default
- `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default
- `VLLM_TORCH_PROFILER_USE_GZIP=0` to disable gzip-compressing profiling files, on by default
- `VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0` to disable dumping and printing the aggregated CUDA self time table, on by default
-
-The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
+- `torch_profiler_record_shapes` to enable recording Tensor Shapes, off by default
+- `torch_profiler_with_memory` to record memory, off by default
+- `torch_profiler_with_stack` to enable recording stack information, on by default
+- `torch_profiler_with_flops` to enable recording FLOPs, off by default
+- `torch_profiler_use_gzip` to control gzip-compressing profiling files, on by default
+- `torch_profiler_dump_cuda_time_total` to control dumping and printing the aggregated CUDA self time table, on by default

 When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag.

@ -40,8 +39,7 @@ Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline
 #### OpenAI Server

 ```bash
-VLLM_TORCH_PROFILER_DIR=./vllm_profile \
-    vllm serve meta-llama/Llama-3.1-8B-Instruct
+vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}'
 ```

 vllm bench command:
@ -104,13 +102,12 @@ To profile the server, you will want to prepend your `vllm serve` command with `

 ```bash
 # server
-VLLM_TORCH_CUDA_PROFILE=1 \
 nsys profile \
    --trace-fork-before-exec=true \
    --cuda-graph-trace=node \
    --capture-range=cudaProfilerApi \
    --capture-range-end repeat \
-    vllm serve meta-llama/Llama-3.1-8B-Instruct
+    vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config.profiler cuda

 # client
 vllm bench serve \
--- a/docs/design/metrics.md
+++ b/docs/design/metrics.md
@ -21,30 +21,20 @@ The mental model is that server-level metrics help explain the values of request

 ### v1 Metrics

-In v1, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:
+In v1, an extensive set of metrics is exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix, for example:

 - `vllm:num_requests_running` (Gauge) - Number of requests currently running.
- `vllm:num_requests_waiting` (Gauge) - Number of requests currently waiting.
 - `vllm:kv_cache_usage_perc` (Gauge) - Fraction of used KV cache blocks (0–1).
 - `vllm:prefix_cache_queries` (Counter) - Number of prefix cache queries.
 - `vllm:prefix_cache_hits` (Counter) - Number of prefix cache hits.
- `vllm:mm_cache_queries` (Counter) - (For multimodal models) Number of multimodal cache queries.
- `vllm:mm_cache_hits` (Counter) - (For multimodal models) Number of multimodal cache hits.
- `vllm:num_preemptions_total` (Counter) - Number of preemptions.
 - `vllm:prompt_tokens_total` (Counter) - Total number of prompt tokens processed.
 - `vllm:generation_tokens_total` (Counter) - Total number of generated tokens.
- `vllm:iteration_tokens_total` (Histogram) - Histogram of tokens processed in each engine step.
- `vllm:cache_config_info` (Gauge) - Information about the cache configuration.
 - `vllm:request_success_total` (Counter) - Number of finished requests (by finish reason).
 - `vllm:request_prompt_tokens` (Histogram) - Histogram of input prompt token counts.
 - `vllm:request_generation_tokens` (Histogram) - Histogram of generation token counts.
- `vllm:request_params_n` (Histogram) - Histogram of request parameter n.
- `vllm:request_params_max_tokens` - (Histogram) - Histogram of max_tokens parameter in requests.
 - `vllm:time_to_first_token_seconds` (Histogram) - Time to first token (TTFT).
 - `vllm:inter_token_latency_seconds` (Histogram) - Inter-token latency.
 - `vllm:e2e_request_latency_seconds` (Histogram) - End-to-end request latency.
- `vllm:request_queue_time_seconds` (Histogram) - Time spent in the queue.
- `vllm:request_inference_time_seconds` (Histogram) - Request inference time.
 - `vllm:request_prefill_time_seconds` (Histogram) - Request prefill time.
 - `vllm:request_decode_time_seconds` (Histogram) - Request decode time.

--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@ -152,5 +152,5 @@ The interface for the model/module may change during vLLM's development. If you
 ## Deprecation announcement

 !!! warning "Deprecations"
-    - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It will be removed in v0.13.0 or v1.0.0.
-    - `_Backend` in `vllm.attention` is deprecated. It will be removed in v0.13.0 or v1.0.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
+    - `use_v1` parameter in `Platform.get_attn_backend_cls` was deprecated and has been removed in v0.13.0.
+    - `_Backend` in `vllm.attention` was deprecated and has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add a new attention backend to `AttentionBackendEnum` instead.
--- a/docs/features/README.md
+++ b/docs/features/README.md
@ -68,8 +68,8 @@ th:not(:first-child) {
 | CUDA graph                                                | ✅                  | ✅        | ✅        | ✅     | ✅        | ❌                  | ✅     | [❌](https://github.com/vllm-project/vllm/issues/26970)        |
 | [pooling](../models/pooling_models.md)                    | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅        |
 | <abbr title="Encoder-Decoder Models">enc-dec</abbr>       | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ❌     | ✅        |
-| [mm](multimodal_inputs.md)                                | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | [🟠](https://github.com/vllm-project/vllm/issues/26965)       |
-| [prompt-embeds](prompt_embeds.md)                         | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ❔     | ✅       |
+| [mm](multimodal_inputs.md)                                | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅        |
+| [prompt-embeds](prompt_embeds.md)                         | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ❔     | ✅        |
 | <abbr title="Logprobs">logP</abbr>                        | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅        |
 | <abbr title="Prompt Logprobs">prmpt logP</abbr>           | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅        |
 | <abbr title="Async Output Processing">async output</abbr> | ✅                  | ✅        | ✅        | ✅     | ✅        | ❌                  | ❌     | ✅        |
--- a/docs/features/disagg_encoder.md
+++ b/docs/features/disagg_encoder.md
@ -32,14 +32,14 @@ Design doc: <https://docs.google.com/document/d/1aed8KtC6XkXtdoV87pWT0a8OJlZ-Cpn

 ## 2  Usage Example

-The current reference pathway is **SharedStorageConnector**.  
+The current reference pathway is **ExampleConnector**.  
 Below ready-to-run scripts shows the workflow:

 1 Encoder instance + 1 PD instance:
-`examples/online_serving/disaggregated_encoder/shared_storage_connector/disagg_encoder_example.sh`
+`examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh`

 1 Encoder instance + 1 Prefill instance + 1 Decode instance:
-`examples/online_serving/disaggregated_encoder/shared_storage_connector/disagg_epd_example.sh`
+`examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh`

 ---

--- a/docs/features/disagg_prefill.md
+++ b/docs/features/disagg_prefill.md
@ -21,14 +21,14 @@ Please refer to [examples/online_serving/disaggregated_prefill.sh](../../example

 Now supports 5 types of connectors:

- **SharedStorageConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of SharedStorageConnector disaggregated prefilling.
+- **ExampleConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of ExampleConnector disaggregated prefilling.
 - **LMCacheConnectorV1**: refer to [examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh](../../examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh) for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
 - **NixlConnector**: refer to [tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh) for the example usage of NixlConnector disaggregated prefilling which support fully async send/recv. For detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
 - **P2pNcclConnector**: refer to [examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh](../../examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh) for the example usage of P2pNcclConnector disaggregated prefilling.
 - **MultiConnector**: take advantage of the kv_connector_extra_config: dict[str, Any] already present in KVTransferConfig to stash all the connectors we want in an ordered list of kwargs.such as:

  ```bash
-  --kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"SharedStorageConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}'
+  --kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"ExampleConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}'
  ```

 For NixlConnector, you may also specify one or multiple NIXL_Backend. Such as:
--- a/docs/features/nixl_connector_usage.md
+++ b/docs/features/nixl_connector_usage.md
@ -22,7 +22,7 @@ python tools/install_nixl_from_source_ubuntu.py
 NixlConnector uses NIXL library for underlying communication, which supports multiple transport backends. UCX (Unified Communication X) is the primary default transport library used by NIXL. Configure transport environment variables:

 ```bash
-# Example UCX configuration, adjust according to your enviroment
+# Example UCX configuration, adjust according to your environment
 export UCX_TLS=all  # or specify specific transports like "rc,ud,sm,^cuda_ipc" ..etc
 export UCX_NET_DEVICES=all  # or specify network devices like "mlx5_0:1,mlx5_1:1"
 ```
@ -146,6 +146,8 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
  --decoder-ports 8000 8000
 ```

+For multi-host DP deployments, you only need to provide the host/port of the head instances.
+
 ### KV Role Options

 - **kv_producer**: For prefiller instances that generate KV caches
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@ -299,6 +299,9 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner

        def is_reasoning_end(self, input_ids: list[int]) -> bool:
            return self.end_token_id in input_ids
+
+        def is_reasoning_end_streaming(self, input_ids: list[int], delta_ids: list[int]) -> bool:
+            return self.end_token_id in delta_ids
        ...
    ```

--- a/docs/features/structured_outputs.md
+++ b/docs/features/structured_outputs.md
@ -61,7 +61,7 @@ Now let´s see an example for each of the cases, starting with the `choice`, as
    print(completion.choices[0].message.content)
    ```

-The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template:
+The next example shows how to use the `regex`. The supported regex syntax depends on the structured output backend. For example, `xgrammar`, `guidance`, and `outlines` use Rust-style regex, while `lm-format-enforcer` uses Python's `re` module. The idea is to generate an email address, given a simple regex template:

 ??? code

--- a/docs/getting_started/installation/README.md
+++ b/docs/getting_started/installation/README.md
@ -26,3 +26,4 @@ The backends below live **outside** the main `vllm` repository and follow the
 | Rebellions ATOM / REBEL NPU | `vllm-rbln` | <https://github.com/rebellions-sw/vllm-rbln> |
 | IBM Spyre AIU | `vllm-spyre` | <https://github.com/vllm-project/vllm-spyre> |
 | Cambricon MLU | `vllm-mlu` | <https://github.com/Cambricon/vllm-mlu> |
+| Baidu Kunlun XPU | N/A, install from source | <https://github.com/baidu/vLLM-Kunlun> |
--- a/docs/getting_started/installation/cpu.arm.inc.md
+++ b/docs/getting_started/installation/cpu.arm.inc.md
@ -29,8 +29,27 @@ uv pip install --pre vllm==<version>+cpu --extra-index-url https://wheels.vllm.a

 The `uv` approach works for vLLM `v0.6.6` and later. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.

-!!! note
-    Nightly wheels are currently unsupported for this architecture. (e.g. to bisect the behavior change, performance regression).
+**Install the latest code**
+
+LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides working pre-built Arm CPU wheels for every commit since `v0.11.2` on <https://wheels.vllm.ai/nightly>. For native CPU wheels, this index should be used:
+
+* `https://wheels.vllm.ai/nightly/cpu/vllm`
+
+To install from the nightly index, copy the link address of the `*.whl` file listed under this index and pass it to `uv pip install`, for example:
+
+```bash
+uv pip install -U https://wheels.vllm.ai/c756fb678184b867ed94e5613a529198f1aee423/vllm-0.13.0rc2.dev11%2Bgc756fb678.cpu-cp38-abi3-manylinux_2_31_aarch64.whl # current nightly build (the filename will change!)
+```
+
+**Install specific revisions**
+
+If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), specify the full commit hash in the index:
+https://wheels.vllm.ai/${VLLM_COMMIT}/cpu/vllm .
+Then, copy the link address of the `*.whl` under this index to run:
+
+```bash
+uv pip install -U <wheel-url>
+```

 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
--- a/docs/mkdocs/hooks/generate_metrics.py
+++ b/docs/mkdocs/hooks/generate_metrics.py
@ -0,0 +1,149 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import ast
+import logging
+from pathlib import Path
+from typing import Literal
+
+logger = logging.getLogger("mkdocs")
+
+ROOT_DIR = Path(__file__).parent.parent.parent.parent
+DOCS_DIR = ROOT_DIR / "docs"
+GENERATED_METRICS_DIR = DOCS_DIR / "generated" / "metrics"
+
+# Files to scan for metric definitions - each will generate a separate table
+METRIC_SOURCE_FILES = [
+    {"path": "vllm/v1/metrics/loggers.py", "output": "general.md"},
+    {
+        "path": "vllm/v1/spec_decode/metrics.py",
+        "output": "spec_decode.md",
+    },
+    {
+        "path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
+        "output": "nixl_connector.md",
+    },
+]
+
+
+class MetricExtractor(ast.NodeVisitor):
+    """AST visitor to extract metric definitions.
+
+    A call is treated as a metric definition when its callee is an attribute
+    access named `_gauge_cls`, `_counter_cls` or `_histogram_cls`. For each
+    such call that has a literal `name` keyword argument, a dict with the keys
+    "name", "type" and "documentation" is appended to `self.metrics`.
+    """
+
+    def __init__(self):
+        # Metrics collected so far, in the order the visitor encounters them.
+        self.metrics: list[dict[str, str]] = []
+
+    def visit_Call(self, node: ast.Call) -> None:
+        """Visit function calls to find metric class instantiations."""
+        metric_type = self._get_metric_type(node)
+        if metric_type:
+            name = self._extract_kwarg(node, "name")
+            documentation = self._extract_kwarg(node, "documentation")
+
+            # A metric whose name is missing or is not a literal constant
+            # (for example an f-string or a variable) is skipped.
+            if name:
+                self.metrics.append(
+                    {
+                        "name": name,
+                        "type": metric_type,
+                        "documentation": documentation or "",
+                    }
+                )
+
+        # Keep descending so calls nested inside this call are also inspected.
+        self.generic_visit(node)
+
+    def _get_metric_type(self, node: ast.Call) -> str | None:
+        """Determine if this call creates a metric and return its type.
+
+        Returns "gauge", "counter" or "histogram" when the callee is an
+        attribute with one of the recognised names, otherwise None.
+        """
+        metric_type_map = {
+            "_gauge_cls": "gauge",
+            "_counter_cls": "counter",
+            "_histogram_cls": "histogram",
+        }
+        # Only attribute-style callees (e.g. `self._gauge_cls(...)`) are matched;
+        # a bare-name callee never counts as a metric.
+        if isinstance(node.func, ast.Attribute):
+            return metric_type_map.get(node.func.attr)
+        return None
+
+    def _extract_kwarg(self, node: ast.Call, key: str) -> str | None:
+        """Extract a keyword argument value from a function call.
+
+        Returns None when the keyword is absent or its value is not a literal
+        constant. Positional arguments are not inspected.
+        """
+        for keyword in node.keywords:
+            if keyword.arg == key:
+                return self._get_string_value(keyword.value)
+        return None
+
+    def _get_string_value(self, node: ast.AST) -> str | None:
+        """Extract string value from an AST node.
+
+        Only `ast.Constant` nodes are handled. A non-string constant is
+        converted with `str()`, and a literal `None` yields None.
+        """
+        if isinstance(node, ast.Constant):
+            return str(node.value) if node.value is not None else None
+        return None
+
+
+def extract_metrics_from_file(filepath: Path) -> list[dict[str, str]]:
+    """Parse a Python file and extract all metric definitions.
+
+    Args:
+        filepath: Path of the Python source file to scan (read as UTF-8).
+
+    Returns:
+        The metric dicts collected by `MetricExtractor`, in visit order.
+
+    Raises:
+        RuntimeError: If reading, parsing or visiting the file fails for any
+            reason; the original exception is chained as the cause.
+    """
+    try:
+        with open(filepath, encoding="utf-8") as f:
+            source = f.read()
+
+        # The source is only parsed, never imported or executed.
+        tree = ast.parse(source, filename=str(filepath))
+        extractor = MetricExtractor()
+        extractor.visit(tree)
+        return extractor.metrics
+    except Exception as e:
+        # Re-raise with the file path so the failing source is identifiable.
+        raise RuntimeError(f"Failed to parse {filepath}: {e}") from e
+
+
+def generate_markdown_table(metrics: list[dict[str, str]]) -> str:
+    """Generate a markdown table from extracted metrics.
+
+    Args:
+        metrics: Dicts with the keys "name", "type" and "documentation".
+
+    Returns:
+        A markdown table with one row per metric, sorted by type and then by
+        name, ending with a newline. For empty input, the text
+        "No metrics found." followed by a newline.
+    """
+    if not metrics:
+        return "No metrics found.\n"
+
+    # Sort by type, then by name
+    metrics_sorted = sorted(metrics, key=lambda m: (m["type"], m["name"]))
+
+    lines = []
+    lines.append("| Metric Name | Type | Description |")
+    lines.append("|-------------|------|-------------|")
+
+    for metric in metrics_sorted:
+        name = metric["name"]
+        metric_type = metric["type"].capitalize()
+        # A table row must stay on one line, so newlines become spaces. A
+        # literal pipe would end the cell early and corrupt the row, so it is
+        # backslash-escaped.
+        doc = metric["documentation"].replace("\n", " ").strip()
+        doc = doc.replace("|", "\\|")
+        lines.append(f"| `{name}` | {metric_type} | {doc} |")
+
+    return "\n".join(lines) + "\n"
+
+
+def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
+    """Generate metrics documentation tables from source files.
+
+    For every entry in `METRIC_SOURCE_FILES`, the source file is scanned for
+    metric definitions and a markdown table is written to
+    `GENERATED_METRICS_DIR / <output>`, overwriting any existing file.
+
+    Args:
+        command: Not used by this function.
+        dirty: Not used by this function.
+
+    Raises:
+        FileNotFoundError: If a configured source file does not exist.
+        RuntimeError: If a source file cannot be read or parsed (raised by
+            `extract_metrics_from_file`).
+    """
+    # NOTE(review): the name, signature and "mkdocs" logger suggest this is
+    # registered as a MkDocs `on_startup` event hook - confirm in mkdocs.yaml.
+    logger.info("Generating metrics documentation")
+
+    # Create generated directory if it doesn't exist
+    GENERATED_METRICS_DIR.mkdir(parents=True, exist_ok=True)
+
+    total_metrics = 0
+    for source_config in METRIC_SOURCE_FILES:
+        source_path = source_config["path"]
+        output_file = source_config["output"]
+
+        # Source paths are configured relative to the repository root.
+        filepath = ROOT_DIR / source_path
+        if not filepath.exists():
+            raise FileNotFoundError(f"Metrics source file not found: {filepath}")
+
+        logger.debug("Extracting metrics from: %s", source_path)
+        metrics = extract_metrics_from_file(filepath)
+        logger.debug("Found %d metrics in %s", len(metrics), source_path)
+
+        # Generate and write the markdown table for this source
+        table_content = generate_markdown_table(metrics)
+        output_path = GENERATED_METRICS_DIR / output_file
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(table_content)
+
+        total_metrics += len(metrics)
+        logger.info(
+            "Generated metrics table: %s (%d metrics)",
+            output_path.relative_to(ROOT_DIR),
+            len(metrics),
+        )
+
+    logger.info(
+        "Total metrics generated: %d across %d files",
+        total_metrics,
+        len(METRIC_SOURCE_FILES),
+    )
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@ -316,10 +316,13 @@ We have split the `encode` task into two more specific token-wise tasks: `token_

 ### Remove softmax from PoolingParams

-We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.
+We are going to remove `softmax` and `activation` from `PoolingParams` in v0.15. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.

 ### as_reward_model

+!!! warning
+    We are going to remove `--convert reward` in v0.15; use `--convert embed` instead.
+
 Pooling models now default support all pooling, you can use it without any settings.

 - Extracting hidden states prefers using `token_embed` task.
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@ -568,7 +568,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
    ```

 !!! note
-    Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker.py](../../examples/pooling/score/qwen3_reranker.py).
+    Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/offline_reranker.py](../../examples/pooling/score/offline_reranker.py).

    ```bash
    vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
--- a/docs/serving/data_parallel_deployment.md
+++ b/docs/serving/data_parallel_deployment.md
@ -24,7 +24,7 @@ There are two distinct modes supported for online deployments - self-contained w

 vLLM supports "self-contained" data parallel deployments that expose a single API endpoint.

-It can be configured by simply including e.g. `--data-parallel-size=4` in the vllm serve command line arguments. This will require 4 GPUs. It can be combined with tensor parallel, for example `--data-parallel-size=4 --tensor-parallel-size=2`, which would require 8 GPUs.
+It can be configured by simply including e.g. `--data-parallel-size=4` in the vllm serve command line arguments. This will require 4 GPUs. It can be combined with tensor parallel, for example `--data-parallel-size=4 --tensor-parallel-size=2`, which would require 8 GPUs. When sizing DP deployments, remember that `--max-num-seqs` applies per DP rank.

 Running a single data parallel deployment across multiple nodes requires a different `vllm serve` to be run on each node, specifying which DP ranks should run on that node. In this case, there will still be a single HTTP entrypoint - the API server(s) will run only on one node, but it doesn't necessarily need to be co-located with the DP ranks.

@ -80,6 +80,18 @@ When deploying large DP sizes using this method, the API server process can beco
 ![DP Internal LB Diagram](../assets/deployment/dp_internal_lb.png)
 </figure>

+## Hybrid Load Balancing
+
+Hybrid load balancing sits between the internal and external approaches. Each node runs its own API server(s) that only queue requests to the data-parallel engines colocated on that node. An upstream load balancer (for example, an ingress controller or traffic router) spreads user requests across those per-node endpoints.
+
+Enable this mode with `--data-parallel-hybrid-lb` while still launching every node with the global data-parallel size. The key differences from internal load balancing are:
+
+- You must provide `--data-parallel-size-local` and `--data-parallel-start-rank` so each node knows which ranks it owns.
+- Not compatible with `--headless` since every node exposes an API endpoint.
+- Scale `--api-server-count` per node based on the number of local ranks.
+
+In this configuration, each node keeps scheduling decisions local, which reduces cross-node traffic and avoids single node bottlenecks at larger DP sizes.
+
 ## External Load Balancing

 For larger scale deployments especially, it can make sense to handle the orchestration and load balancing of data parallel ranks externally.
--- a/docs/serving/expert_parallel_deployment.md
+++ b/docs/serving/expert_parallel_deployment.md
@ -40,10 +40,12 @@ EP_SIZE = TP_SIZE × DP_SIZE

 Where:

- `TP_SIZE`: Tensor parallel size (always 1 for now)
+- `TP_SIZE`: Tensor parallel size
 - `DP_SIZE`: Data parallel size
 - `EP_SIZE`: Expert parallel size (computed automatically)

+When EP is enabled, MoE layers use expert parallelism instead of tensor parallelism, while attention layers continue to use tensor parallelism if `TP_SIZE > 1`.
+
 ### Example Command

 The following command serves a `DeepSeek-V3-0324` model with 1-way tensor parallel, 8-way (attention) data parallel, and 8-way expert parallel. The attention weights are replicated across all GPUs, while the expert weights are split across GPUs. It will work on a H200 (or H20) node with 8 GPUs. For H100, you can try to serve a smaller model or refer to the multi-node deployment section.
@ -81,7 +83,7 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \
    --data-parallel-size-local 8 \           # Local DP size on this node (8 GPUs per node)
    --data-parallel-address 192.168.1.100 \  # Replace with actual IP of Node 1
    --data-parallel-rpc-port 13345 \         # RPC communication port, can be any port as long as reachable by all nodes
-    --api-server-count=8                     # Number of API servers for load handling (scaling this out to total ranks are recommended)
+    --api-server-count=8                     # Number of API servers for load handling (scaling this out to the number of local ranks is recommended)

 # Node 2 (Secondary - headless mode, no API server)
 vllm serve deepseek-ai/DeepSeek-V3-0324 \
@ -119,9 +121,6 @@ While MoE models are typically trained so that each expert receives a similar nu

 Enable EPLB with the `--enable-eplb` flag.

-!!! note "Model Support"
-    Currently only DeepSeek V3 architecture is supported.
-
 When enabled, vLLM collects load statistics with every forward pass and periodically rebalances expert distribution.

 ### EPLB Parameters
@ -134,6 +133,8 @@ Configure EPLB with the `--eplb-config` argument, which accepts a JSON string. T
 | `step_interval`| Frequency of rebalancing (every N engine steps) | 3000 |
 | `log_balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` |
 | `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` |
+| `use_async` | Use non-blocking EPLB for reduced latency overhead | `false` |
+| `policy` | The policy type for expert parallel load balancing | `"default"` |

 For example:

@ -183,6 +184,26 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \

 For multi-node deployment, add these EPLB flags to each node's command. We recommend setting `--eplb-config '{"num_redundant_experts":32}'` to 32 in large scale use cases so the most popular experts are always available.

+## Advanced Configuration
+
+### Performance Optimization
+
+- **DeepEP kernels**: The `high_throughput` and `low_latency` kernels are optimized for disaggregated serving and may show poor performance for mixed workloads
+- **Dual Batch Overlap**: Use `--enable-dbo` to overlap all-to-all communication with compute. See [Dual Batch Overlap](../design/dbo.md) for more details.
+- **Async scheduling (experimental)**: Try `--async-scheduling` to overlap scheduling with model execution.
+
+### Troubleshooting
+
+- **`non-zero status: 7 cannot register cq buf`**: When using Infiniband/RoCE, make sure host VM and pods show `ulimit -l` "unlimited".
+- **`init failed for transport: IBGDA`**: The InfiniBand GDA kernel modules are missing. Run `tools/ep_kernels/configure_system_drivers.sh` on each GPU node and reboot. Also fixes error `NVSHMEM API called before NVSHMEM initialization has completed`.
+- **NVSHMEM peer disconnect**: Usually a networking misconfiguration. If deploying via Kubernetes, verify that every pod runs with `hostNetwork: true`, `securityContext.privileged: true` to access Infiniband.
+
+### Benchmarking
+
+- Use simulator flags `VLLM_MOE_ROUTING_SIMULATION_STRATEGY=uniform_random` and `VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1` so token routing is balanced across EP ranks.
+
+- Increasing `VLLM_MOE_DP_CHUNK_SIZE` may increase throughput by increasing the maximum batch size for inter-rank token transfers. This may cause DeepEP to throw `assert self.nvshmem_qp_depth >= (num_max_dispatch_tokens_per_rank + 1) * 2`, which can be fixed by increasing the `NVSHMEM_QP_DEPTH` environment variable.
+
 ## Disaggregated Serving (Prefill/Decode Split)

 For production deployments requiring strict SLA guarantees for time-to-first-token and inter-token latency, disaggregated serving allows independent scaling of prefill and decode operations.
@ -273,3 +294,9 @@ except Exception as e:
    print(f"❌ Error during disaggregated serving: {e}")
    print("Check that both prefill and decode instances are running and accessible")
 ```
+
+### Benchmarking
+
+- To simulate the decode deployment of disaggregated serving, pass `--kv-transfer-config '{"kv_connector":"DecodeBenchConnector","kv_role":"kv_both"}'` to the `vllm serve` invocation. The connector populates KV cache with random values so decode can be profiled in isolation.
+
+- **CUDAGraph capture**: Use `--compilation_config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'` to enable CUDA graph capture for decode only, which saves memory that can be used for the KV cache.
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@ -851,7 +851,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin
 [Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
 popular open-source tools.

-Code example: [examples/pooling/score/jinaai_rerank_client.py](../../examples/pooling/score/jinaai_rerank_client.py)
+Code example: [examples/pooling/score/openai_reranker.py](../../examples/pooling/score/openai_reranker.py)

 #### Example Request

--- a/docs/usage/metrics.md
+++ b/docs/usage/metrics.md
@ -33,11 +33,19 @@ Then query the endpoint to get the latest metrics from the server:

 The following metrics are exposed:

-??? code
+## General Metrics

-    ```python
-    --8<-- "vllm/engine/metrics.py:metrics-definitions"
-    ```
+--8<-- "docs/generated/metrics/general.md"
+
+## Speculative Decoding Metrics
+
+--8<-- "docs/generated/metrics/spec_decode.md"
+
+## NIXL KV Connector Metrics
+
+--8<-- "docs/generated/metrics/nixl_connector.md"
+
+## Deprecation Policy

 Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
 but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@ -422,7 +422,7 @@ def parse_args():
    parser.add_argument(
        "--seed",
        type=int,
-        default=None,
+        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    parser.add_argument(
--- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
@ -30,7 +30,7 @@ def main():
        max_num_batched_tokens=64,
        max_num_seqs=16,
        kv_transfer_config=KVTransferConfig(
-            kv_connector="SharedStorageConnector",
+            kv_connector="ExampleConnector",
            kv_role="kv_both",
            kv_connector_extra_config={"shared_storage_path": "local_storage"},
        ),
--- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
@ -26,7 +26,7 @@ def main():
        enforce_eager=True,
        gpu_memory_utilization=0.8,
        kv_transfer_config=KVTransferConfig(
-            kv_connector="SharedStorageConnector",
+            kv_connector="ExampleConnector",
            kv_role="kv_both",
            kv_connector_extra_config={"shared_storage_path": "local_storage"},
        ),
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@ -77,7 +77,7 @@ def parse_args():
    parser.add_argument(
        "--seed",
        type=int,
-        default=None,
+        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    return parser.parse_args()
--- a/examples/offline_inference/kv_load_failure_recovery/README.md
+++ b/examples/offline_inference/kv_load_failure_recovery/README.md
@ -10,7 +10,7 @@ It demonstrates vLLM's ability to recover from KV load failures in both synchron
 - `decode_example.py` – performs the decode stage. Accepts:
    - `--simulate-failure`: simulates KV load failure using a custom connector.
    - `--async-load`: enables asynchronous KV loading mode.
- `rogue_shared_storage_connector.py` – defines `RogueSharedStorageConnector`, a subclass of `SharedStorageConnector`, that simulates missing or corrupted external KV blocks by failing to load blocks for the first decode request.
+- `load_recovery_example_connector.py` – defines `LoadRecoveryExampleConnector`, a subclass of `ExampleConnector`, that simulates missing or corrupted external KV blocks by failing to load blocks for the first decode request.
 - `run.sh` – orchestrates the test: runs the prefill stage, then three decode stages:
    1. Normal decode (baseline).
    2. Decode with simulated sync KV load failure.
@ -20,7 +20,7 @@ It demonstrates vLLM's ability to recover from KV load failures in both synchron

 ## How It Works

- The test dynamically loads `RogueSharedStorageConnector` via `KVTransferConfig.kv_connector_module_path`, enabling controlled simulation of load failures without modifying the original connector.
+- The test dynamically loads `LoadRecoveryExampleConnector` via `KVTransferConfig.kv_connector_module_path`, enabling controlled simulation of load failures without modifying the original connector.
 - The decode stages that simulate failure are expected to trigger recovery logic in vLLM, resulting in the same output as the baseline decode.
 - If recovery fails, the script prints a unified diff of the output mismatch and exits with error.

--- a/examples/offline_inference/kv_load_failure_recovery/decode_example.py
+++ b/examples/offline_inference/kv_load_failure_recovery/decode_example.py
@ -35,13 +35,13 @@ def main():

    if args.simulate_failure:
        ktc = KVTransferConfig(
-            kv_connector="RogueSharedStorageConnector",
+            kv_connector="LoadRecoveryExampleConnector",
            kv_role="kv_both",
            kv_connector_extra_config={
                "shared_storage_path": "local_storage",
                "async_load": args.async_load,
            },
-            kv_connector_module_path="rogue_shared_storage_connector",
+            kv_connector_module_path="load_recovery_example_connector",
        )
        out_file = (
            "async_decode_recovered_output.txt"
@ -50,7 +50,7 @@ def main():
        )
    else:
        ktc = KVTransferConfig(
-            kv_connector="SharedStorageConnector",
+            kv_connector="ExampleConnector",
            kv_role="kv_both",
            kv_connector_extra_config={
                "shared_storage_path": "local_storage",
--- a/examples/offline_inference/kv_load_failure_recovery/load_recovery_example_connector.py
+++ b/examples/offline_inference/kv_load_failure_recovery/load_recovery_example_connector.py
@ -10,9 +10,9 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
    KVConnectorMetadata,
    KVConnectorRole,
 )
-from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (
-    SharedStorageConnector,
-    SharedStorageConnectorMetadata,
+from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import (
+    ExampleConnector,
+    ExampleConnectorMetadata,
 )
 from vllm.forward_context import ForwardContext
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
@ -26,15 +26,15 @@ logging.basicConfig(level=logging.INFO)


@dataclass
-class RogueSharedStorageConnectorMetadata(SharedStorageConnectorMetadata):
+class LoadRecoveryExampleConnectorMetadata(ExampleConnectorMetadata):
    req_to_block_ids: dict[str, set[int]] = field(default_factory=dict)

    @classmethod
-    def from_base(cls, base: SharedStorageConnectorMetadata):
+    def from_base(cls, base: ExampleConnectorMetadata):
        return cls(requests=base.requests)


-class RogueSharedStorageConnector(SharedStorageConnector):
+class LoadRecoveryExampleConnector(ExampleConnector):
    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
        super().__init__(vllm_config=vllm_config, role=role)
        self._async_load = vllm_config.kv_transfer_config.get_from_extra_config(
@ -45,7 +45,7 @@ class RogueSharedStorageConnector(SharedStorageConnector):
        self._req_to_block_ids: dict[str, list[int]] = dict()

    def bind_connector_metadata(self, connector_metadata: KVConnectorMetadata) -> None:
-        assert isinstance(connector_metadata, RogueSharedStorageConnectorMetadata)
+        assert isinstance(connector_metadata, LoadRecoveryExampleConnectorMetadata)
        index, failed_request = next(
            (
                (i, x)
@ -84,7 +84,7 @@ class RogueSharedStorageConnector(SharedStorageConnector):
    ) -> tuple[set[str] | None, set[str] | None]:
        if self._async_load:
            meta = self._get_connector_metadata()
-            assert isinstance(meta, RogueSharedStorageConnectorMetadata)
+            assert isinstance(meta, LoadRecoveryExampleConnectorMetadata)
            if meta.req_to_block_ids:
                return None, set(meta.req_to_block_ids)

@ -126,9 +126,9 @@ class RogueSharedStorageConnector(SharedStorageConnector):
    ) -> KVConnectorMetadata:
        if not self._async_load:
            base = super().build_connector_meta(scheduler_output)
-            meta = RogueSharedStorageConnectorMetadata.from_base(base)
+            meta = LoadRecoveryExampleConnectorMetadata.from_base(base)
        else:
-            meta = RogueSharedStorageConnectorMetadata()
+            meta = LoadRecoveryExampleConnectorMetadata()
            if self._requests_need_load:
                for req_id, request in self._requests_need_load.items():
                    meta.add_request(
--- a/examples/offline_inference/kv_load_failure_recovery/prefill_example.py
+++ b/examples/offline_inference/kv_load_failure_recovery/prefill_example.py
@ -26,7 +26,7 @@ def main():
        enforce_eager=True,
        gpu_memory_utilization=0.8,
        kv_transfer_config=KVTransferConfig(
-            kv_connector="SharedStorageConnector",
+            kv_connector="ExampleConnector",
            kv_role="kv_both",
            kv_connector_extra_config={"shared_storage_path": "local_storage"},
        ),
--- a/examples/offline_inference/qwen2_5_omni/only_thinker.py
+++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py
@ -158,7 +158,7 @@ def parse_args():
    parser.add_argument(
        "--seed",
        type=int,
-        default=None,
+        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )

--- a/examples/offline_inference/qwen3_omni/only_thinker.py
+++ b/examples/offline_inference/qwen3_omni/only_thinker.py
@ -158,7 +158,7 @@ def parse_args():
    parser.add_argument(
        "--seed",
        type=int,
-        default=None,
+        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )

--- a/Show More
+++ b/Show More